Skip to content

Commit 2ef9b45

Browse files
committed
feat(model/cosyvoice):support http sse api
1 parent d8287d1 commit 2ef9b45

3 files changed

Lines changed: 358 additions & 0 deletions

File tree

dashscope/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
from dashscope.assistants import Assistant, AssistantList, Assistants
1818
from dashscope.assistants.assistant_types import AssistantFile, DeleteResponse
1919
from dashscope.audio.asr.transcription import Transcription
20+
from dashscope.audio.http_tts.http_speech_synthesizer import (
21+
HttpSpeechSynthesizer,
22+
)
2023
from dashscope.audio.tts.speech_synthesizer import SpeechSynthesizer
2124
from dashscope.common.api_key import save_api_key
2225
from dashscope.common.env import (
@@ -88,6 +91,7 @@
8891
"MultiModalEmbeddingItemImage",
8992
"MultiModalEmbeddingItemText",
9093
"SpeechSynthesizer",
94+
"HttpSpeechSynthesizer",
9195
"MultiModalConversation",
9296
"AioMultiModalConversation",
9397
"BatchTextEmbedding",
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (c) Alibaba, Inc. and its affiliates.
3+
4+
from dashscope.audio.http_tts.http_speech_synthesizer import (
5+
HttpSpeechSynthesisResult,
6+
HttpSpeechSynthesizer,
7+
)
8+
9+
__all__ = [
10+
"HttpSpeechSynthesizer",
11+
"HttpSpeechSynthesisResult",
12+
]
Lines changed: 342 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (c) Alibaba, Inc. and its affiliates.
3+
4+
from http import HTTPStatus
5+
from typing import Dict, Iterator, List, Optional, Union
6+
7+
from dashscope.api_entities.dashscope_response import (
8+
DashScopeAPIResponse,
9+
SpeechSynthesisResponse,
10+
)
11+
from dashscope.client.base_api import BaseApi
12+
from dashscope.common.constants import HTTPMethod
13+
14+
15+
class HttpSpeechSynthesisResult:
    """Read-only container for one HTTP speech-synthesis result.

    Every field is optional and defaults to ``None``, except ``sentences``
    which defaults to an empty list.
    """

    def __init__(
        self,
        audio_data: Optional[bytes] = None,
        audio_url: Optional[str] = None,
        audio_id: Optional[str] = None,
        expires_at: Optional[int] = None,
        sentences: Optional[List[Dict]] = None,
        response: Optional["SpeechSynthesisResponse"] = None,
    ):
        """Store the given fields; a falsy ``sentences`` becomes ``[]``."""
        self._response = response
        # None (or an empty list) is replaced by a fresh list so that
        # each instance owns its own default.
        self._sentences = sentences if sentences else []
        self._expires_at = expires_at
        self._audio_id = audio_id
        self._audio_url = audio_url
        self._audio_data = audio_data

    @property
    def audio_data(self) -> Optional[bytes]:
        """Raw audio bytes carried by this result, if any."""
        return self._audio_data

    @property
    def audio_url(self) -> Optional[str]:
        """URL of the synthesized audio, if any."""
        return self._audio_url

    @property
    def audio_id(self) -> Optional[str]:
        """Identifier of the synthesized audio, if any."""
        return self._audio_id

    @property
    def expires_at(self) -> Optional[int]:
        """Expiration timestamp associated with the audio URL, if any."""
        return self._expires_at

    @property
    def sentences(self) -> List[Dict]:
        """Sentence-level synthesis entries; empty when none were given."""
        return self._sentences

    @property
    def response(self) -> Optional["SpeechSynthesisResponse"]:
        """The full API response attached to this result, if any."""
        return self._response
63+
64+
65+
class HttpSpeechSynthesizer(BaseApi):
    """HTTP-based text-to-speech interface for CosyVoice.

    ``call`` posts one synthesis request and returns either a single
    ``HttpSpeechSynthesisResult`` (non-streaming) or an iterator of them
    (streaming, requested with the ``X-DashScope-SSE`` header).
    """

    class AudioFormat:
        """String constants for the ``audio_format`` argument of ``call``."""

        WAV = "wav"
        PCM = "pcm"
        MP3 = "mp3"

    @classmethod
    def call(  # pylint: disable=arguments-renamed
        cls,
        model: str,
        text: str,
        voice: str,
        audio_format: str = "wav",
        sample_rate: int = 24000,
        stream: bool = False,
        workspace: Optional[str] = None,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
        **kwargs,
    ) -> Union[HttpSpeechSynthesisResult, Iterator[HttpSpeechSynthesisResult]]:
        """Convert text to speech via HTTP API.

        Args:
            model (str): The speech synthesis model, e.g.,
                'cosyvoice-v3-flash'.
            text (str): The text to synthesize.
            voice (str): The voice to use for synthesis.
            audio_format (str): Audio encoding format ('wav', 'pcm', 'mp3').
                Defaults to 'wav'.
            sample_rate (int): Audio sample rate in Hz. Defaults to 24000.
            stream (bool): Whether to use streaming (SSE) mode.
                Defaults to False.
            workspace (str): The DashScope workspace ID.
            api_key (str): The DashScope API key.
            url (str): custom http url if needed.
            **kwargs: Additional parameters like volume, rate, pitch, etc.
                Entries whose value is None are dropped; the rest are
                merged into the request's ``input`` object.

        Returns:
            HttpSpeechSynthesisResult: For non-streaming mode.
            Iterator[HttpSpeechSynthesisResult]: For streaming mode.

        Raises:
            RuntimeError: If a ``DashScopeAPIResponse`` reports a status
                other than 200 OK. In streaming mode this check runs
                while the returned iterator is consumed.
        """
        # Extra options are spread last, so on a key clash they take
        # precedence over the named fields above them.
        body = {
            "model": model,
            "input": {
                "text": text,
                "voice": voice,
                "format": audio_format,
                "sample_rate": sample_rate,
                **{k: v for k, v in kwargs.items() if v is not None},
            },
        }

        # The SSE header is only sent when streaming was requested.
        headers = {"X-DashScope-SSE": "enable"} if stream else None

        response = cls._http_call(
            method=HTTPMethod.POST,
            body=body,
            headers=headers,
            stream=stream,
            workspace=workspace,
            api_key=api_key,
            url=url,
        )

        if stream:
            return cls._handle_streaming_response(response)
        return cls._handle_non_streaming_response(response)

    @classmethod
    def _http_call(
        cls,
        method: str,
        body: Dict,
        headers: Optional[Dict] = None,
        stream: bool = False,
        workspace: Optional[str] = None,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
    ):
        """Build an ``HttpRequest`` for the TTS endpoint and execute it.

        Args:
            method: HTTP method handed to ``HttpRequest``.
            body: Request body, wrapped in ``_RequestData``.
            headers: Extra headers to add to the request, if any.
            stream: Forwarded to ``HttpRequest`` as its ``stream`` flag.
            workspace: When truthy, sent as ``X-DashScope-Workspace``.
            api_key: API key; ``get_default_api_key()`` is used when None.
            url: Base URL; ``dashscope.base_http_api_url`` is used when
                falsy. The TTS service path is joined onto it either way.

        Returns:
            Whatever ``HttpRequest.call()`` returns.
        """
        # NOTE(review): these imports are function-scoped, presumably to
        # avoid an import cycle (the ``dashscope`` package imports this
        # module) - confirm before hoisting them to module level.
        import dashscope
        from dashscope.api_entities.http_request import HttpRequest
        from dashscope.common.api_key import get_default_api_key
        from dashscope.common.utils import join_url

        base_url = url if url else dashscope.base_http_api_url
        url_for_call = join_url(
            base_url,
            "services/audio/tts/SpeechSynthesizer",
        )

        if api_key is None:
            api_key = get_default_api_key()

        request = HttpRequest(
            url=url_for_call,
            api_key=api_key,
            http_method=method,
            stream=stream,
        )

        # Caller-supplied headers first, then the workspace header, in
        # two separate calls as before.
        if headers:
            request.add_headers(headers)
        if workspace:
            request.add_headers({"X-DashScope-Workspace": workspace})

        request.data = _RequestData(body)

        return request.call()

    @classmethod
    def _output_or_raise(cls, response, failure_label: str) -> Dict:
        """Return the ``output`` mapping of *response*, never None.

        Args:
            response: A ``DashScopeAPIResponse`` or any other object that
                offers ``.get("output")`` (handled as a raw dict).
            failure_label: Leading words of the error message, e.g.
                ``"Request failed"``.

        Returns:
            The output mapping, or ``{}`` when it is missing or null.

        Raises:
            RuntimeError: If *response* is a ``DashScopeAPIResponse`` whose
                status code is not 200 OK.
        """
        if isinstance(response, DashScopeAPIResponse):
            if response.status_code != HTTPStatus.OK:
                raise RuntimeError(
                    f"{failure_label} with status {response.status_code}: "
                    f"{response.message}",
                )
            return response.output or {}
        # ``or`` instead of a ``get`` default: the default only applies
        # to a missing key, not to a key that is present but null.
        return response.get("output") or {}

    @classmethod
    def _handle_non_streaming_response(
        cls,
        response,
    ) -> HttpSpeechSynthesisResult:
        """Turn a complete response into an ``HttpSpeechSynthesisResult``.

        Reads ``url``, ``id`` and ``expires_at`` from ``output["audio"]``;
        each is None when absent.

        Raises:
            RuntimeError: If *response* is a ``DashScopeAPIResponse`` whose
                status code is not 200 OK.
        """
        output = cls._output_or_raise(response, "Request failed")
        audio_info = output.get("audio") or {}

        return HttpSpeechSynthesisResult(
            audio_url=audio_info.get("url"),
            audio_id=audio_info.get("id"),
            expires_at=audio_info.get("expires_at"),
        )

    @classmethod
    def _handle_streaming_response(
        cls,
        response,
    ) -> Iterator[HttpSpeechSynthesisResult]:
        """Translate a stream of response parts into results.

        For every part:

        * if ``output["type"]`` starts with ``"sentence-"`` and
          ``output["sentence"]`` is non-empty, the sentence is recorded;
        * if ``output["audio"]["data"]`` is present, it is base64-decoded
          and yielded together with a copy of the sentences seen so far
          (plus a ``SpeechSynthesisResponse`` when the part is a
          ``DashScopeAPIResponse``);
        * otherwise, if ``output["finish_reason"] == "stop"``, a final
          result is yielded holding all decoded audio concatenated, the
          audio ``url`` / ``id`` / ``expires_at`` and the sentences.

        Parts matching neither of the last two cases yield nothing.

        Raises:
            RuntimeError: If a ``DashScopeAPIResponse`` part has a status
                code other than 200 OK.
        """
        # Function-scope import as before, but executed once per stream
        # rather than once per audio frame.
        import base64

        audio_data_parts: List[bytes] = []
        sentences: List[Dict] = []

        for part in response:
            output = cls._output_or_raise(part, "Stream error")

            # ``or`` guards make a present-but-null field behave like a
            # missing one instead of raising AttributeError.
            output_type = output.get("type") or ""
            if output_type.startswith("sentence-"):
                sentence_info = output.get("sentence")
                if sentence_info:
                    sentences.append(sentence_info)

            audio_info = output.get("audio") or {}
            audio_data = audio_info.get("data")

            if audio_data:
                audio_bytes = base64.b64decode(audio_data)
                audio_data_parts.append(audio_bytes)
                # Only typed API responses are converted and attached;
                # raw dict parts leave ``response`` unset.
                if isinstance(part, DashScopeAPIResponse):
                    attached = SpeechSynthesisResponse.from_api_response(
                        part,
                    )
                else:
                    attached = None
                yield HttpSpeechSynthesisResult(
                    audio_data=audio_bytes,
                    sentences=sentences.copy(),
                    response=attached,
                )
            elif output.get("finish_reason") == "stop":
                yield HttpSpeechSynthesisResult(
                    audio_data=(
                        b"".join(audio_data_parts)
                        if audio_data_parts
                        else None
                    ),
                    audio_url=audio_info.get("url"),
                    audio_id=audio_info.get("id"),
                    expires_at=audio_info.get("expires_at"),
                    sentences=sentences.copy(),
                )
323+
324+
325+
class _RequestData:
    """Minimal adapter exposing a dict body through payload accessors."""

    def __init__(self, data: Dict):
        """Keep a reference to *data* (the dict is not copied)."""
        self._data = data

    def get_http_payload(self):
        """Return the 3-tuple ``(False, None, body)``."""
        body = self._data
        return (False, None, body)

    def get_aiohttp_payload(self):
        """Return the 2-tuple ``(False, body)``."""
        body = self._data
        return (False, body)

    @property
    def parameters(self):
        """Query parameters: always a new empty dict."""
        return dict()

0 commit comments

Comments
 (0)