Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion app/ai/voice/agents/breeze_buddy/template/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ class TTSProvider(str, Enum):
ELEVENLABS = "elevenlabs"
CARTESIA = "cartesia"
SARVAM = "sarvam"
SONIOX = "soniox"


# Maps legacy tts_voice_name values to current provider strings for backward compat.
Expand Down Expand Up @@ -317,10 +318,18 @@ class TTSConfig(BaseModel):
"speed": 0.9,
"pitch": 0.0
}

Example (Soniox):
{
"provider": "soniox",
"voice_id": "Adrian",
"model": "tts-rt-v1-preview",
"language": "en"
}
"""

provider: TTSProvider = Field(
..., description="TTS provider (elevenlabs, cartesia, sarvam)"
..., description="TTS provider (elevenlabs, cartesia, sarvam, soniox)"
)
voice_id: Optional[str] = Field(None, description="Provider-specific voice ID")
model: Optional[str] = Field(
Expand Down
28 changes: 28 additions & 0 deletions app/ai/voice/agents/breeze_buddy/tts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
CartesiaConfig,
ElevenLabsConfig,
SarvamTTSConfig,
SonioxTTSConfig,
build_cartesia_tts,
build_elevenlabs_tts,
build_sarvam_tts,
build_soniox_tts,
)
from app.ai.voice.tts.cartesia import _generate_cartesia_audio
from app.ai.voice.tts.elevenlabs import _generate_elevenlabs_audio
from app.ai.voice.tts.sarvam import _generate_sarvam_audio
from app.ai.voice.tts.soniox import _generate_soniox_audio
from app.core.config.dynamic import (
BB_AGGREGATE_SENTENCES,
BB_ENABLE_ELEVENLABS_INDIAN_RESIDENCY,
Expand All @@ -33,6 +36,7 @@
ELEVENLABS_INDIAN_RESIDENCY_API_KEY,
ELEVENLABS_INDIAN_RESIDENCY_WEBSOCKET_URL,
SARVAM_API_KEY,
SONIOX_API_KEY,
)
from app.core.logger import logger

Expand Down Expand Up @@ -187,6 +191,22 @@ async def get_tts_service(voice_config: TTSConfig):
)
)

elif provider == "soniox":
if not SONIOX_API_KEY:
raise ValueError("SONIOX_API_KEY is required for Soniox TTS")

aggregate = await BB_AGGREGATE_SENTENCES("soniox")

return build_soniox_tts(
SonioxTTSConfig(
api_key=SONIOX_API_KEY,
voice=voice_config.voice_id or "Adrian",
model=voice_config.model or "tts-rt-v1-preview",
language=_parse_language(voice_config.language, Language.EN),
aggregate_sentences=aggregate,
)
)

else:
raise ValueError(f"Unsupported TTS provider: {provider}")

Expand Down Expand Up @@ -239,6 +259,14 @@ async def generate_audio(
model=resolved.model,
)
input_format = "raw"
elif provider == "soniox":
audio_data = await _generate_soniox_audio(
text=text,
voice=resolved.voice_id,
model=resolved.model,
language=resolved.language,
)
input_format = "raw"
Comment on lines +262 to +269
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Language lookup inconsistency between streaming and one-shot paths.

get_tts_service uses _parse_language (key-based: Language[code.upper().replace("-", "_")]) for robustness, but generate_audio forwards resolved.language as a raw string to _generate_soniox_audio which applies value-based Language(language). These resolve identically for lowercase BCP 47 codes ("en", "hi"), but diverge for uppercase inputs ("EN", "EN_IN"): the value-based path silently falls back to Language.EN, while the streaming path would correctly map to the intended enum member.

🛠️ Proposed fix — pre-parse with _parse_language before forwarding
-    elif provider == "soniox":
-        audio_data = await _generate_soniox_audio(
-            text=text,
-            voice=resolved.voice_id,
-            model=resolved.model,
-            language=resolved.language,
-        )
-        input_format = "raw"
+    elif provider == "soniox":
+        audio_data = await _generate_soniox_audio(
+            text=text,
+            voice=resolved.voice_id,
+            model=resolved.model,
+            language=_parse_language(resolved.language, Language.EN),
+        )
+        input_format = "raw"

This also requires updating _generate_soniox_audio's language parameter type from Optional[str] to Optional[Language] to skip re-parsing internally.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@app/ai/voice/agents/breeze_buddy/tts/__init__.py` around lines 262 - 269, The
streaming vs one-shot language handling is inconsistent: update the
generate_audio call site to pre-parse resolved.language using _parse_language
(the same logic get_tts_service uses) before passing it to
_generate_soniox_audio, and change _generate_soniox_audio's language parameter
type from Optional[str] to Optional[Language] so it skips internal value-based
parsing; reference get_tts_service, _parse_language, generate_audio,
_generate_soniox_audio, resolved.language and the Language enum when making the
change.

else:
raise ValueError(f"Unsupported TTS provider: {provider}")

Expand Down
4 changes: 4 additions & 0 deletions app/ai/voice/tts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .elevenlabs import ElevenLabsConfig, build_elevenlabs_tts
from .google import GoogleConfig, build_google_tts
from .sarvam import SarvamTTSConfig, build_sarvam_tts, get_sarvam_language
from .soniox import SonioxTTSConfig, build_soniox_tts

__all__ = [
# Cartesia
Expand All @@ -25,4 +26,7 @@
"SarvamTTSConfig",
"build_sarvam_tts",
"get_sarvam_language",
# Soniox
"SonioxTTSConfig",
"build_soniox_tts",
]
158 changes: 158 additions & 0 deletions app/ai/voice/tts/soniox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""Soniox TTS helpers and builder.

Wraps pipecat's :class:`SonioxTTSService` (WebSocket streaming TTS) to fit the
shared builder pattern used by other providers in this package, and provides a
one-shot WebSocket synth helper for greeting prep — pipecat's Soniox client
is streaming-only, so the one-shot path is implemented directly against the
same WebSocket protocol.
"""

from __future__ import annotations

import asyncio
import base64
import json
from dataclasses import dataclass
from typing import Optional

from pipecat.services.soniox.tts import (
    SonioxTTSService,
    language_to_soniox_tts_language,
)
from pipecat.services.tts_service import TextAggregationMode
from pipecat.transcriptions.language import Language
from websockets.asyncio.client import connect as websocket_connect

from app.core.config.static import SONIOX_API_KEY
from app.core.logger import logger

__all__ = [
"SonioxTTSConfig",
"build_soniox_tts",
"_generate_soniox_audio",
]


SONIOX_TTS_WS_URL = "wss://tts-rt.soniox.com/tts-websocket"


@dataclass
class SonioxTTSConfig:
    """Settings bundle consumed by :func:`build_soniox_tts`.

    The first three fields are required; the rest carry defaults so callers
    only need to override what differs from the standard setup.
    """

    # Required: credential plus the voice/model pair to synthesize with.
    api_key: str
    voice: str
    model: str

    # Optional: spoken language, defaults to English.
    language: Language = Language.EN

    # Optional: output audio parameters handed to the service constructor.
    sample_rate: int = 16000
    audio_format: str = "pcm_s16le"

    # Optional: True selects sentence-level text aggregation in the builder,
    # False selects token-level aggregation.
    aggregate_sentences: bool = True


def build_soniox_tts(config: SonioxTTSConfig) -> SonioxTTSService:
    """Construct a pipecat ``SonioxTTSService`` from a :class:`SonioxTTSConfig`.

    The ``Language`` enum on the config is handed to the service settings
    unchanged; translating it into a Soniox language code is left to pipecat.

    Args:
        config: API key, voice/model/language selection, audio output
            parameters, and the sentence-aggregation switch.

    Returns:
        A configured ``SonioxTTSService`` instance.
    """
    summary = (
        f"Using SonioxTTSService with model={config.model}, voice={config.voice}, "
        f"language={config.language}, sample_rate={config.sample_rate}, "
        f"audio_format={config.audio_format}"
    )
    logger.info(summary)

    # Sentence aggregation is the default; token mode is used when it is off.
    if config.aggregate_sentences:
        aggregation_mode = TextAggregationMode.SENTENCE
    else:
        aggregation_mode = TextAggregationMode.TOKEN

    voice_settings = SonioxTTSService.Settings(
        voice=config.voice,
        model=config.model,
        language=config.language,
    )

    return SonioxTTSService(
        api_key=config.api_key,
        sample_rate=config.sample_rate,
        audio_format=config.audio_format,
        settings=voice_settings,
        text_aggregation_mode=aggregation_mode,
    )


def _resolve_soniox_language(language: Optional[str]) -> Language:
    """Turn a language code (or ``Language`` member) into a ``Language``, defaulting to EN."""
    if not language:
        return Language.EN
    if isinstance(language, Language):
        return language

    try:
        # Value-based lookup handles codes spelled exactly as the enum values.
        return Language(language)
    except ValueError:
        pass

    try:
        # Name-based lookup handles other spellings of the same code
        # (e.g. "EN", "EN_IN", "en-in"), which the value lookup rejects.
        return Language[str(language).upper().replace("-", "_")]
    except KeyError:
        logger.warning(
            f"Invalid Soniox language code '{language}', falling back to EN"
        )
        return Language.EN


async def _collect_soniox_audio(config_msg: dict, text_msg: dict) -> list[bytes]:
    """Run one config + text exchange on a fresh WebSocket and return the decoded audio chunks."""
    audio_chunks: list[bytes] = []
    async with websocket_connect(SONIOX_TTS_WS_URL) as ws:
        await ws.send(json.dumps(config_msg))
        await ws.send(json.dumps(text_msg))

        async for raw in ws:
            try:
                msg = json.loads(raw)
            except ValueError:
                # Covers JSONDecodeError and undecodable binary frames; such
                # frames carry nothing usable, so keep reading.
                continue
            if not isinstance(msg, dict):
                continue

            error_code = msg.get("error_code")
            if error_code is not None:
                error_message = msg.get("error_message", "")
                raise RuntimeError(f"Soniox TTS error {error_code}: {error_message}")

            audio_b64 = msg.get("audio")
            if audio_b64:
                audio_chunks.append(base64.b64decode(audio_b64))

            if msg.get("terminated"):
                break

    return audio_chunks


async def _generate_soniox_audio(
    text: str,
    voice: Optional[str] = None,
    model: Optional[str] = None,
    language: Optional[str] = None,
    sample_rate: int = 16000,
    *,
    timeout_secs: float = 30.0,
) -> bytes:
    """One-shot synthesis over the Soniox WebSocket, used for greeting prep.

    Opens a single WebSocket, sends a config message followed by the text with
    ``text_end: true``, collects base64-encoded audio chunks until a message
    with ``terminated`` arrives, and returns the concatenated bytes.

    Args:
        text: Text to synthesize.
        voice: Soniox voice name; defaults to ``"Adrian"`` when empty.
        model: Soniox model name; defaults to ``"tts-rt-v1-preview"`` when empty.
        language: Language code. Resolved by enum value first, then by enum
            name (upper-cased, ``-`` replaced with ``_``); unknown codes fall
            back to English with a warning.
        sample_rate: Sample rate requested in the config message.
        timeout_secs: Overall deadline for connecting, sending, and receiving.

    Returns:
        Audio bytes in the ``pcm_s16le`` format requested from Soniox.

    Raises:
        ValueError: ``SONIOX_API_KEY`` is not configured.
        TimeoutError: No complete response arrived within ``timeout_secs``.
        RuntimeError: Soniox reported an error or returned no audio.
    """
    if not SONIOX_API_KEY:
        raise ValueError("SONIOX_API_KEY is required for Soniox TTS")

    voice = voice or "Adrian"
    model = model or "tts-rt-v1-preview"

    lang_enum = _resolve_soniox_language(language)
    soniox_lang = language_to_soniox_tts_language(lang_enum) or "en"

    config_msg = {
        "api_key": SONIOX_API_KEY,
        "stream_id": "greeting",
        "model": model,
        "voice": voice,
        "language": soniox_lang,
        "audio_format": "pcm_s16le",
        "sample_rate": sample_rate,
    }
    text_msg = {"text": text, "text_end": True, "stream_id": "greeting"}

    # Log metadata only: by this point the text may contain substituted
    # template values (e.g. customer names) that must not reach the logs.
    logger.info(
        f"Synthesizing greeting with Soniox (pcm_s16le {sample_rate}), "
        f"text_length={len(text)} chars"
    )

    # Bound the whole exchange so a silent server-side hang cannot stall
    # greeting preparation; on timeout the inner task is cancelled, which
    # also closes the WebSocket.
    try:
        audio_chunks = await asyncio.wait_for(
            _collect_soniox_audio(config_msg, text_msg),
            timeout=timeout_secs,
        )
    except asyncio.TimeoutError as err:
        raise TimeoutError(
            f"Soniox TTS timed out after {timeout_secs}s waiting for audio"
        ) from err

    if not audio_chunks:
        raise RuntimeError("No audio returned from Soniox TTS")

    return b"".join(audio_chunks)
5 changes: 5 additions & 0 deletions app/core/config/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
"speed": 0.9,
"pitch": 0.0,
},
"soniox": {
"voice_id": "Adrian",
"model": "tts-rt-v1-preview",
"language": "en",
},
}


Expand Down
Loading