Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ Robot 负责高效的任务管理与记忆管理,能够智能地处理用户

- 打开config/config.yaml 配置ASR LLM等相关配置
- 下载SenseVoiceSmall到目录models/SenseVoiceSmall [SenseVoiceSmall下载地址](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main)
- 去deepseek官网,获取配置api_key,[deepseek获取api_key](https://platform.deepseek.com/api_keys),当然也可以配置openai、qwen、gemini、01yi等其他模型
- 去deepseek官网,获取配置api_key,[deepseek获取api_key](https://platform.deepseek.com/api_keys),当然也可以配置openai、qwen、gemini、01yi、MiniMax 等其他模型
- 使用 **MiniMax**:在 `config/config.yaml` 中将 `selected_module.LLM` 设为 `MiniMaxLLM`,并在 `LLM.MiniMaxLLM` 下填入 `api_key`(默认模型 `MiniMax-M3`,Base URL `https://api.minimax.io/v1`)。MiniMax 也提供云端 TTS:将 `selected_module.TTS` 设为 `MiniMaxTTS` 并填入同一 `api_key` 即可(默认模型 `speech-2.8-hd`)。
- 如果需要使用通用AIGC配置(测试中),不可用的话,可以使用tag 分支 v0.0.1 v0.0.2
- /third_party/OpenManus/config/config.toml 需要配置里面的 model、base_url、api_key
- 为支持openclaw,需要修改config/.env,配置openclaw Auth权限
Expand Down
3 changes: 2 additions & 1 deletion README_en.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ Make sure you have the following tools and libraries installed in your developme

- Open `config/config.yaml` to configure ASR, LLM, etc.
- Download `SenseVoiceSmall` to the `models/SenseVoiceSmall` directory [SenseVoiceSmall Download](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main)
- Get an API key from the DeepSeek website: [DeepSeek API Key](https://platform.deepseek.com/api_keys), or you can configure other models such as OpenAI, Qwen, Gemini, 01yi.
- Get an API key from the DeepSeek website: [DeepSeek API Key](https://platform.deepseek.com/api_keys), or you can configure other models such as OpenAI, Qwen, Gemini, 01yi, MiniMax.
- To use **MiniMax**: set `selected_module.LLM` to `MiniMaxLLM` in `config/config.yaml` and fill in `api_key` under `LLM.MiniMaxLLM` (default model `MiniMax-M3`, base URL `https://api.minimax.io/v1`). MiniMax also provides cloud TTS: set `selected_module.TTS` to `MiniMaxTTS` and reuse the same `api_key` (default model `speech-2.8-hd`).

4. Run the project:

Expand Down
20 changes: 20 additions & 0 deletions bailing/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,26 @@ def response_call(self, dialogue, tools):
logger.error(f"OllamaLLM tool-call error: {e}")


class MiniMaxLLM(OpenAILLM):
    """LLM provider backed by MiniMax.

    MiniMax serves an OpenAI-compatible Chat Completions API, so all of the
    request logic is inherited from OpenAILLM; this subclass only supplies
    fallback values for the base URL and the model name.
    Docs: https://platform.minimax.io/docs/api-reference/text-openai-api
    """

    DEFAULT_BASE_URL = "https://api.minimax.io/v1"
    DEFAULT_MODEL = "MiniMax-M3"

    def __init__(self, config):
        """Fill in missing/blank ``url`` and ``model_name``, then delegate.

        Args:
            config: mapping of provider settings; it is copied, so the
                caller's object is never mutated.
        """
        settings = dict(config)
        fallbacks = (
            ("url", self.DEFAULT_BASE_URL),
            ("model_name", self.DEFAULT_MODEL),
        )
        for key, fallback in fallbacks:
            # A key that is absent, None or empty counts as "not configured".
            settings[key] = settings.get(key) or fallback
        super().__init__(settings)


def create_instance(class_name, *args, **kwargs):
# 获取类对象
cls = globals().get(class_name)
Expand Down
86 changes: 86 additions & 0 deletions bailing/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from datetime import datetime
from gtts import gTTS
import edge_tts
import requests
import ChatTTS
import torch
import torchaudio
Expand Down Expand Up @@ -295,6 +296,91 @@ def to_tts(self, text: str) -> str:
return ""


class MiniMaxTTS(AbstractTTS):
    """MiniMax Text-to-Speech provider (T2A v2 HTTP API).

    Converts text to speech via MiniMax's cloud TTS service and writes the
    resulting audio to a local file. The API returns hex-encoded audio (not
    base64). Docs: https://platform.minimax.io/docs/api-reference/speech-t2a-http

    config keys (a key that is absent, None or an empty string falls back to
    its default; a blank YAML entry such as ``base_url:`` is loaded as None):
    - api_key: MiniMax API key (shared with the Chat API)
    - voice: voice id, e.g. 'English_Graceful_Lady' (default)
    - model: TTS model id, e.g. 'speech-2.8-hd' (default)
    - base_url: API base url, default 'https://api.minimax.io'
    - output_file: directory to write audio files to, default 'tmp/'
    - format: audio format, default 'mp3'
    - sample_rate / bitrate / speed / vol / pitch: optional fine tuning
    """

    @staticmethod
    def _setting(config, key, default):
        """Return ``config[key]``, or ``default`` when it is unset or blank.

        Only None and "" count as blank, so an explicit numeric 0 (for
        example ``pitch: 0``) is preserved.
        """
        value = config.get(key)
        if value is None or value == "":
            return default
        return value

    def __init__(self, config):
        """Read provider settings from ``config`` (a mapping)."""
        pick = self._setting
        self.output_file = pick(config, "output_file", "tmp/")
        self.api_key = config.get("api_key")
        self.voice = pick(config, "voice", "English_Graceful_Lady")
        self.model = pick(config, "model", "speech-2.8-hd")
        # str() guards against a non-string value before stripping the slash.
        self.base_url = str(pick(config, "base_url", "https://api.minimax.io")).rstrip("/")
        self.format = pick(config, "format", "mp3")
        self.sample_rate = pick(config, "sample_rate", 32000)
        self.bitrate = pick(config, "bitrate", 128000)
        self.speed = pick(config, "speed", 1)
        self.vol = pick(config, "vol", 1)
        self.pitch = pick(config, "pitch", 0)

    def _generate_filename(self, extension=".mp3"):
        """Build a unique output path inside ``self.output_file``."""
        return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")

    def _log_execution_time(self, start_time):
        """Log the seconds elapsed since ``start_time`` at debug level."""
        end_time = time.time()
        execution_time = end_time - start_time
        logger.debug(f"Execution Time: {execution_time:.2f} seconds")

    def to_tts(self, text):
        """Synthesize ``text`` and write the audio to a new file.

        Returns:
            The path of the written audio file, or None on any failure
            (missing api_key, HTTP/transport error, API-level error, empty
            audio, or a file-system error). Failures are logged, not raised.
        """
        if not self.api_key:
            # Without a key the request would go out as "Bearer None" and be
            # rejected anyway; fail fast with an actionable message instead.
            logger.error("MiniMax TTS failed: api_key is not configured")
            return None

        tmpfile = self._generate_filename(f".{self.format}")
        start_time = time.time()
        payload = {
            "model": self.model,
            "text": text,
            "stream": False,
            "voice_setting": {
                "voice_id": self.voice,
                "speed": self.speed,
                "vol": self.vol,
                "pitch": self.pitch,
            },
            "audio_setting": {
                "sample_rate": self.sample_rate,
                "bitrate": self.bitrate,
                "format": self.format,
                "channel": 1,
            },
        }
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        try:
            response = requests.post(
                f"{self.base_url}/v1/t2a_v2", headers=headers, json=payload, timeout=30
            )
            response.raise_for_status()
            result = response.json()
            # `or {}` also covers an explicit JSON null, which .get(key, {})
            # would hand back as None.
            base_resp = result.get("base_resp") or {}
            if base_resp.get("status_code") != 0:
                logger.error(f"MiniMax TTS failed: {base_resp.get('status_msg')}")
                return None
            audio_hex = (result.get("data") or {}).get("audio")
            if not audio_hex:
                logger.error("MiniMax TTS returned empty audio")
                return None
            # Make sure the target directory exists before the first write.
            out_dir = os.path.dirname(tmpfile)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
            with open(tmpfile, "wb") as f:
                f.write(bytes.fromhex(audio_hex))  # MiniMax returns hex, not base64
            self._log_execution_time(start_time)
            return tmpfile
        except Exception as e:
            # Deliberate best-effort boundary: any failure is reported to the
            # caller as None rather than propagated.
            logger.error(f"Failed to generate TTS file: {e}")
            return None


def create_instance(class_name, *args, **kwargs):
# 获取类对象
cls = globals().get(class_name)
Expand Down
10 changes: 10 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ LLM:
model_name: deepseek-chat
url: https://api.deepseek.com
api_key:
MiniMaxLLM:
model_name: MiniMax-M3
url: https://api.minimax.io/v1
api_key:
OllamaLLM:
model_name: qwen-chat
url: http://localhost:11434/api/chat # 注意后缀是api/chat
Expand All @@ -67,6 +71,12 @@ TTS:
lang: z
voice: zf_001
repo_id: hexgrad/Kokoro-82M-v1.1-zh
MiniMaxTTS:
output_file: tmp/
api_key:
voice: English_Graceful_Lady
model: speech-2.8-hd
base_url: https://api.minimax.io

Player:
PygameSoundPlayer: null
Expand Down