Skip to content

Commit 88686c4

Browse files
committed
feat(model/cosyvoice):support http sse api
1 parent d8287d1 commit 88686c4

4 files changed

Lines changed: 311 additions & 2 deletions

File tree

dashscope/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@
1717
from dashscope.assistants import Assistant, AssistantList, Assistants
1818
from dashscope.assistants.assistant_types import AssistantFile, DeleteResponse
1919
from dashscope.audio.asr.transcription import Transcription
20+
from dashscope.audio.http_tts.http_speech_synthesizer import (
21+
HttpSpeechSynthesizer,
22+
)
2023
from dashscope.audio.tts.speech_synthesizer import SpeechSynthesizer
2124
from dashscope.common.api_key import save_api_key
2225
from dashscope.common.env import (
@@ -88,6 +91,7 @@
8891
"MultiModalEmbeddingItemImage",
8992
"MultiModalEmbeddingItemText",
9093
"SpeechSynthesizer",
94+
"HttpSpeechSynthesizer",
9195
"MultiModalConversation",
9296
"AioMultiModalConversation",
9397
"BatchTextEmbedding",
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (c) Alibaba, Inc. and its affiliates.
3+
4+
from dashscope.audio.http_tts.http_speech_synthesizer import (
5+
HttpSpeechSynthesisResult,
6+
HttpSpeechSynthesizer,
7+
)
8+
9+
__all__ = [
10+
"HttpSpeechSynthesizer",
11+
"HttpSpeechSynthesisResult",
12+
]
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (c) Alibaba, Inc. and its affiliates.
3+
4+
import base64
5+
from http import HTTPStatus
6+
from typing import Dict, Iterator, List, Optional, Union
7+
8+
from dashscope.api_entities.dashscope_response import (
9+
DashScopeAPIResponse,
10+
SpeechSynthesisResponse,
11+
)
12+
from dashscope.client.base_api import BaseApi
13+
from dashscope.common.constants import HTTPMethod
14+
15+
16+
class HttpSpeechSynthesisResult:
17+
"""The result of HTTP speech synthesis."""
18+
19+
def __init__(
20+
self,
21+
audio_data: Optional[bytes] = None,
22+
audio_url: Optional[str] = None,
23+
audio_id: Optional[str] = None,
24+
expires_at: Optional[int] = None,
25+
sentences: Optional[List[Dict]] = None,
26+
response: Optional[SpeechSynthesisResponse] = None,
27+
):
28+
self._audio_data = audio_data
29+
self._audio_url = audio_url
30+
self._audio_id = audio_id
31+
self._expires_at = expires_at
32+
self._sentences = sentences or []
33+
self._response = response
34+
35+
@property
36+
def audio_data(self) -> Optional[bytes]:
37+
"""Get the audio data (for streaming mode)."""
38+
return self._audio_data
39+
40+
@property
41+
def audio_url(self) -> Optional[str]:
42+
"""Get the audio URL (for non-streaming mode)."""
43+
return self._audio_url
44+
45+
@property
46+
def audio_id(self) -> Optional[str]:
47+
"""Get the audio ID."""
48+
return self._audio_id
49+
50+
@property
51+
def expires_at(self) -> Optional[int]:
52+
"""Get the URL expiration timestamp."""
53+
return self._expires_at
54+
55+
@property
56+
def sentences(self) -> List[Dict]:
57+
"""Get the sentence-level synthesis results (for streaming mode)."""
58+
return self._sentences
59+
60+
@property
61+
def response(self) -> Optional[SpeechSynthesisResponse]:
62+
"""Get the full API response."""
63+
return self._response
64+
65+
66+
class HttpSpeechSynthesizer(BaseApi):
    """HTTP-based text-to-speech interface for CosyVoice.

    ``call`` posts one synthesis request. Without streaming it returns a
    single result built from the response's audio ``url``, ``id`` and
    ``expires_at``; with streaming it returns a generator of results that
    carry base64-decoded audio chunks.
    """

    class AudioFormat:
        """Names of the audio encodings listed for ``audio_format``."""

        WAV = "wav"
        PCM = "pcm"
        MP3 = "mp3"

    @classmethod
    def call(  # type: ignore # pylint: disable=arguments-renamed
        cls,
        model: str,
        text: str,
        voice: str,
        audio_format: str = "wav",
        sample_rate: int = 24000,
        stream: bool = False,
        workspace: Optional[str] = None,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
        **kwargs,
    ) -> Union[HttpSpeechSynthesisResult, Iterator[HttpSpeechSynthesisResult]]:
        """Convert text to speech via HTTP API.

        Args:
            model (str): The speech synthesis model, e.g.,
                'cosyvoice-v3-flash'.
            text (str): The text to synthesize.
            voice (str): The voice to use for synthesis.
            audio_format (str): Audio encoding format ('wav', 'pcm', 'mp3').
                Defaults to 'wav'.
            sample_rate (int): Audio sample rate in Hz. Defaults to 24000.
            stream (bool): Whether to use streaming (SSE) mode.
                Defaults to False.
            workspace (str): The DashScope workspace ID.
            api_key (str): The DashScope API key.
            url (str): custom http url if needed.
            **kwargs: Additional parameters like volume, rate, pitch, etc.
                Entries whose value is None are not sent.

        Returns:
            HttpSpeechSynthesisResult: For non-streaming mode.
            Iterator[HttpSpeechSynthesisResult]: For streaming mode.

        Raises:
            RuntimeError: If a ``DashScopeAPIResponse`` reports a status
                other than 200. In streaming mode the result is a
                generator, so this surfaces while iterating.
        """
        # Extra keyword arguments are merged into "input" last, so a keyword
        # named like one of the fixed fields replaces that field.
        body = {
            "model": model,
            "input": {
                "text": text,
                "voice": voice,
                "format": audio_format,
                "sample_rate": sample_rate,
                **{k: v for k, v in kwargs.items() if v is not None},
            },
        }

        # The SSE header is only sent when streaming was requested.
        headers = {}
        if stream:
            headers["X-DashScope-SSE"] = "enable"

        response = cls._http_call(
            method=HTTPMethod.POST,
            body=body,
            headers=headers if headers else None,
            stream=stream,
            workspace=workspace,
            api_key=api_key,
            url=url,
        )

        if stream:
            return cls._handle_streaming_response(response)
        return cls._handle_non_streaming_response(response)

    @classmethod
    def _http_call(
        cls,
        method: str,
        body: Dict,
        headers: Optional[Dict] = None,
        stream: bool = False,
        workspace: Optional[str] = None,
        api_key: Optional[str] = None,
        url: Optional[str] = None,
    ):
        """Build an ``HttpRequest`` for the synthesis endpoint and run it.

        Args:
            method: HTTP method passed to ``HttpRequest``.
            body: Request body, wrapped in ``_RequestData`` before sending.
            headers: Extra headers to add to the request, if any.
            stream: Forwarded to ``HttpRequest`` as its ``stream`` flag.
            workspace: When truthy, sent as ``X-DashScope-Workspace``.
            api_key: API key; when None, ``get_default_api_key()`` is used.
            url: Base URL; when falsy, ``dashscope.base_http_api_url`` is
                used. The synthesis path is appended in both cases.

        Returns:
            Whatever ``HttpRequest.call()`` returns.
        """
        from dashscope.api_entities.http_request import HttpRequest

        # Get base URL
        import dashscope
        from dashscope.common.utils import join_url

        if url:
            base_url = url
        else:
            base_url = dashscope.base_http_api_url
        url_for_call = join_url(
            base_url,
            "services/audio/tts/SpeechSynthesizer",
        )

        # Get API key
        from dashscope.common.api_key import get_default_api_key

        if api_key is None:
            api_key = get_default_api_key()

        # Prepare workspace header
        workspace_headers = {}
        if workspace:
            workspace_headers["X-DashScope-Workspace"] = workspace

        # Create request
        request = HttpRequest(
            url=url_for_call,
            api_key=api_key,
            http_method=method,
            stream=stream,
        )

        # Add custom headers
        if headers:
            request.add_headers(headers)
        if workspace_headers:
            request.add_headers(workspace_headers)

        # Set request body
        request.data = _RequestData(body)

        return request.call()

    @staticmethod
    def _extract_output(part):
        """Extract output dict from a response part.

        Handles both DashScopeAPIResponse and raw dict,
        raising on error status.

        Returns:
            The ``output`` mapping, or an empty dict when it is missing,
            empty or None.

        Raises:
            RuntimeError: If ``part`` is a ``DashScopeAPIResponse`` whose
                status code is not 200.
        """
        if isinstance(part, DashScopeAPIResponse):
            if part.status_code != HTTPStatus.OK:
                raise RuntimeError(
                    f"Request failed: {part.status_code} " f"{part.message}",
                )
            return part.output or {}

        # ``get("output", {})`` would still hand back None for an explicit
        # null, so normalise with ``or`` instead of a default.
        return part.get("output") or {}

    @classmethod
    def _handle_non_streaming_response(
        cls,
        response,
    ) -> HttpSpeechSynthesisResult:
        """Handle non-streaming response.

        Reads ``url``, ``id`` and ``expires_at`` from the output's
        ``audio`` mapping; fields that are absent become None.
        """
        output = cls._extract_output(response)
        # Tolerate a missing or null "audio" entry.
        audio_info = output.get("audio") or {}

        return HttpSpeechSynthesisResult(
            audio_url=audio_info.get("url"),
            audio_id=audio_info.get("id"),
            expires_at=audio_info.get("expires_at"),
        )

    @classmethod
    def _handle_streaming_response(
        cls,
        response,
    ) -> Iterator[HttpSpeechSynthesisResult]:
        """Handle streaming (SSE) response.

        Yields one result per event that carries audio data, holding the
        decoded bytes of that event and a copy of the sentences collected
        so far. An event without audio data whose ``finish_reason`` is
        ``"stop"`` yields a final result holding all decoded audio joined
        together plus the ``url``, ``id`` and ``expires_at`` of that event.

        Raises:
            RuntimeError: Propagated from ``_extract_output`` for a failed
                part.
        """
        audio_data_parts: List[bytes] = []
        sentences: List[Dict] = []

        for part in response:
            output = cls._extract_output(part)
            # Both values may be present but null; normalise before use.
            output_type = output.get("type") or ""
            audio_info = output.get("audio") or {}

            if output_type.startswith("sentence-"):
                sentence_info = output.get("sentence")
                if sentence_info:
                    sentences.append(sentence_info)

            audio_data = audio_info.get("data")
            if audio_data:
                audio_bytes = base64.b64decode(audio_data)
                audio_data_parts.append(audio_bytes)
                yield HttpSpeechSynthesisResult(
                    audio_data=audio_bytes,
                    # Copy so later appends do not alter yielded results.
                    sentences=sentences.copy(),
                )
            # NOTE(review): an event carrying both audio data and
            # finish_reason == "stop" is emitted as a chunk only.
            elif output.get("finish_reason") == "stop":
                yield HttpSpeechSynthesisResult(
                    audio_data=(
                        b"".join(audio_data_parts)
                        if audio_data_parts
                        else None
                    ),
                    audio_url=audio_info.get("url"),
                    audio_id=audio_info.get("id"),
                    expires_at=audio_info.get("expires_at"),
                    sentences=sentences.copy(),
                )
274+
275+
276+
class _RequestData:
    """Adapter that gives a plain request body the accessors a request needs.

    The wrapped dict is handed back unchanged by the payload getters.
    """

    def __init__(self, data: Dict):
        # Keep a reference to the caller's dict; it is not copied.
        self._data = data

    @property
    def parameters(self):
        """Return query parameters (always an empty dict)."""
        no_parameters = {}
        return no_parameters

    def get_aiohttp_payload(self):
        """Return the aiohttp payload as ``(False, data)``."""
        # presumably (is_form, body) -- confirm against the request class
        payload = (False, self._data)
        return payload

    def get_http_payload(self):
        """Return the HTTP payload as ``(False, None, data)``."""
        # presumably (is_form, form, body) -- confirm against the request class
        payload = (False, None, self._data)
        return payload

dashscope/audio/qwen_omni/omni_realtime.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def connect(self) -> None:
199199
if not (self.ws.sock and self.ws.sock.connected):
200200
raise TimeoutError(
201201
"websocket connection could not established within 5s. "
202-
"Please check your network connection, firewall settings,"
202+
"Please check your network connection, firewall settings, "
203203
"or server status.",
204204
)
205205
self.callback.on_open()
@@ -593,8 +593,8 @@ def _on_close( # pylint: disable=unused-argument
593593

594594
# WebSocket发生错误的回调函数
595595
def _on_error(self, ws, error): # pylint: disable=unused-argument
596-
print(f"websocket closed due to {error}")
597596
# pylint: disable=broad-exception-raised
597+
logger.error("websocket closed due to %s", error)
598598
raise Exception(f"websocket closed due to {error}")
599599

600600
# 获取上一个任务的taskId

0 commit comments

Comments
 (0)