-
Notifications
You must be signed in to change notification settings - Fork 57
feat(tts): add Soniox TTS provider end-to-end #748
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: release
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,158 @@ | ||
| """Soniox TTS helpers and builder. | ||
|
|
||
| Wraps pipecat's :class:`SonioxTTSService` (WebSocket streaming TTS) to fit the | ||
| shared builder pattern used by other providers in this package, and provides a | ||
| one-shot WebSocket synth helper for greeting prep — pipecat's Soniox client | ||
| is streaming-only, so the one-shot path is implemented directly against the | ||
| same WebSocket protocol. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import base64 | ||
| import json | ||
| from dataclasses import dataclass | ||
| from typing import Optional | ||
|
|
||
| from pipecat.services.soniox.tts import ( | ||
| SonioxTTSService, | ||
| language_to_soniox_tts_language, | ||
| ) | ||
| from pipecat.services.tts_service import TextAggregationMode | ||
| from pipecat.transcriptions.language import Language | ||
| from websockets.asyncio.client import connect as websocket_connect | ||
|
|
||
| from app.core.config.static import SONIOX_API_KEY | ||
| from app.core.logger import logger | ||
|
|
||
| __all__ = [ | ||
| "SonioxTTSConfig", | ||
| "build_soniox_tts", | ||
| "_generate_soniox_audio", | ||
| ] | ||
|
|
||
|
|
||
| SONIOX_TTS_WS_URL = "wss://tts-rt.soniox.com/tts-websocket" | ||
|
|
||
|
|
||
@dataclass
class SonioxTTSConfig:
    """Settings bundle consumed by :func:`build_soniox_tts`.

    Attributes:
        api_key: Soniox API key handed to the service.
        voice: Name of the Soniox voice to synthesize with.
        model: Soniox TTS model identifier.
        language: Synthesis language as a pipecat ``Language`` member;
            defaults to ``Language.EN``.
        sample_rate: Output sample rate in Hz; defaults to 16000.
        audio_format: Audio encoding requested from the service;
            defaults to ``"pcm_s16le"``.
        aggregate_sentences: When true the service is built with
            sentence-level text aggregation, otherwise token-level.
    """

    # Required fields first: the dataclass-generated __init__ takes them
    # positionally in this order, so the order must not change.
    api_key: str
    voice: str
    model: str

    # Optional tuning knobs.
    language: Language = Language.EN
    sample_rate: int = 16000
    audio_format: str = "pcm_s16le"
    aggregate_sentences: bool = True
|
|
||
|
|
||
def build_soniox_tts(config: SonioxTTSConfig) -> SonioxTTSService:
    """Build a streaming Soniox TTS service from ``config``.

    The ``Language`` enum on the config is passed through unchanged;
    pipecat is expected to translate it into a Soniox language code when
    the service is initialised (NOTE(review): confirm against the pipecat
    version in use).

    Args:
        config: Voice, model, language and audio settings for the service.

    Returns:
        A configured :class:`SonioxTTSService`.
    """
    # The API key is deliberately left out of the log line.
    summary = (
        f"Using SonioxTTSService with model={config.model}, voice={config.voice}, "
        f"language={config.language}, sample_rate={config.sample_rate}, "
        f"audio_format={config.audio_format}"
    )
    logger.info(summary)

    # Sentence aggregation buffers text until a full sentence is available;
    # token mode forwards text as it arrives.
    if config.aggregate_sentences:
        aggregation_mode = TextAggregationMode.SENTENCE
    else:
        aggregation_mode = TextAggregationMode.TOKEN

    service_settings = SonioxTTSService.Settings(
        voice=config.voice,
        model=config.model,
        language=config.language,
    )

    return SonioxTTSService(
        api_key=config.api_key,
        sample_rate=config.sample_rate,
        audio_format=config.audio_format,
        settings=service_settings,
        text_aggregation_mode=aggregation_mode,
    )
|
|
||
|
|
||
def _resolve_soniox_language(language: Optional[str]) -> Language:
    """Map a language code string to a ``Language`` member, defaulting to EN.

    Lookup is attempted by enum value first (e.g. ``"en"``) and then by
    enum member name after upper-casing and replacing ``-`` with ``_``
    (e.g. ``"EN"``), so both spellings resolve instead of silently falling
    back to English.
    """
    if not language:
        return Language.EN

    try:
        return Language(language)
    except ValueError:
        pass

    try:
        return Language[str(language).upper().replace("-", "_")]
    except KeyError:
        logger.warning(
            f"Invalid Soniox language code '{language}', falling back to EN"
        )
        return Language.EN


async def _generate_soniox_audio(
    text: str,
    voice: Optional[str] = None,
    model: Optional[str] = None,
    language: Optional[str] = None,
    sample_rate: int = 16000,
    timeout_secs: Optional[float] = 30.0,
) -> bytes:
    """One-shot synth via Soniox WebSocket for greeting prep.

    Opens a single WebSocket, sends config + text + ``text_end:true``,
    collects base64-encoded audio chunks until a ``terminated`` message
    arrives (or the server closes the connection), and returns the
    concatenated bytes.

    The request asks for ``pcm_s16le`` at ``sample_rate``; the result is
    expected to be 16-bit little-endian mono PCM, matching what
    ``convert_to_mulaw`` expects for downstream telephony use.

    Args:
        text: Text to synthesize.
        voice: Soniox voice name; defaults to ``"Adrian"``.
        model: Soniox model id; defaults to ``"tts-rt-v1-preview"``.
        language: Language code; unknown or missing codes fall back to
            English.
        sample_rate: Requested output sample rate in Hz.
        timeout_secs: Upper bound, in seconds, for the whole WebSocket
            exchange (connect, send, receive). ``None`` disables the limit.

    Returns:
        The concatenated audio bytes.

    Raises:
        ValueError: If ``SONIOX_API_KEY`` is not configured.
        TimeoutError: If the exchange does not finish within
            ``timeout_secs``.
        RuntimeError: If Soniox reports an error or returns no audio.
    """
    # Local import keeps this block self-contained; asyncio is stdlib.
    import asyncio

    if not SONIOX_API_KEY:
        raise ValueError("SONIOX_API_KEY is required for Soniox TTS")

    voice = voice or "Adrian"
    model = model or "tts-rt-v1-preview"

    lang_enum = _resolve_soniox_language(language)
    soniox_lang = language_to_soniox_tts_language(lang_enum) or "en"

    config_msg = {
        "api_key": SONIOX_API_KEY,
        "stream_id": "greeting",
        "model": model,
        "voice": voice,
        "language": soniox_lang,
        "audio_format": "pcm_s16le",
        "sample_rate": sample_rate,
    }
    text_msg = {"text": text, "text_end": True, "stream_id": "greeting"}

    # Log metadata only: the greeting text may contain personal data.
    logger.info(
        f"Synthesizing greeting with Soniox (pcm_s16le {sample_rate}), "
        f"text_length={len(text)} chars"
    )

    async def _collect_audio() -> list:
        """Run the WebSocket exchange and return the decoded audio chunks."""
        chunks = []
        async with websocket_connect(SONIOX_TTS_WS_URL) as ws:
            await ws.send(json.dumps(config_msg))
            await ws.send(json.dumps(text_msg))

            async for raw in ws:
                try:
                    msg = json.loads(raw)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    logger.debug("Ignoring non-JSON frame from Soniox TTS")
                    continue

                # Only JSON objects carry protocol fields; skip anything else
                # rather than failing on ``.get``.
                if not isinstance(msg, dict):
                    logger.debug("Ignoring non-object JSON frame from Soniox TTS")
                    continue

                error_code = msg.get("error_code")
                if error_code is not None:
                    error_message = msg.get("error_message", "")
                    raise RuntimeError(
                        f"Soniox TTS error {error_code}: {error_message}"
                    )

                audio_b64 = msg.get("audio")
                if audio_b64:
                    chunks.append(base64.b64decode(audio_b64))

                if msg.get("terminated"):
                    break
        return chunks

    # Bound the whole exchange so a stalled server cannot hang the caller.
    # On timeout wait_for cancels the inner task, which closes the socket
    # through the ``async with`` block.
    try:
        audio_chunks = await asyncio.wait_for(_collect_audio(), timeout=timeout_secs)
    except asyncio.TimeoutError as err:
        raise TimeoutError(
            f"Soniox TTS timed out after {timeout_secs}s waiting for audio"
        ) from err

    if not audio_chunks:
        raise RuntimeError("No audio returned from Soniox TTS")

    return b"".join(audio_chunks)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Language lookup inconsistency between streaming and one-shot paths.
`get_tts_service` uses `_parse_language` (key-based: `Language[code.upper().replace("-", "_")]`) for robustness, but `generate_audio` forwards `resolved.language` as a raw string to `_generate_soniox_audio`, which applies value-based `Language(language)`. These resolve identically for lowercase BCP 47 codes (`"en"`, `"hi"`), but diverge for uppercase inputs (`"EN"`, `"EN_IN"`): the value-based path silently falls back to `Language.EN`, while the streaming path would correctly map to the intended enum member.

🛠️ Proposed fix — pre-parse with `_parse_language` before forwarding
This also requires updating `_generate_soniox_audio`'s `language` parameter type from `Optional[str]` to `Optional[Language]` to skip re-parsing internally.

🤖 Prompt for AI Agents