From 44b34304a8e4d9098c36b96a774053f56d834a52 Mon Sep 17 00:00:00 2001
From: octo-patch
Date: Wed, 24 Jun 2026 22:50:45 +0800
Subject: [PATCH] feat: add MiniMax LLM and TTS provider support

Add MiniMax as a new provider for both chat (LLM) and speech (TTS):

- MiniMaxLLM: subclasses OpenAILLM since MiniMax exposes an OpenAI-compatible
  Chat Completions API; defaults to MiniMax-M3 with base URL
  https://api.minimax.io/v1.
- MiniMaxTTS: calls the MiniMax T2A v2 HTTP API and writes hex-decoded audio to
  a file, matching the AbstractTTS interface. Blank config values fall back to
  the defaults, the output directory is created on demand, and the audio is
  decoded before the file is opened so a bad payload leaves no empty file.
- Add config entries for both modules in config/config.yaml.
- Document MiniMax setup in README.md and README_en.md.
---
 README.md          |   3 +-
 README_en.md       |   3 +-
 bailing/llm.py     |  21 +++++++++
 bailing/tts.py     | 110 ++++++++++++++++++++++++++++++++++++++++++++++
 config/config.yaml |  10 +++++
 5 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6e755d5..75ddcb9 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,8 @@ Robot 负责高效的任务管理与记忆管理,能够智能地处理用户
 
    - 打开config/config.yaml 配置ASR LLM等相关配置
    - 下载SenseVoiceSmall到目录models/SenseVoiceSmall [SenseVoiceSmall下载地址](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main)
-   - 去deepseek官网,获取配置api_key,[deepseek获取api_key](https://platform.deepseek.com/api_keys),当然也可以配置openai、qwen、gemini、01yi等其他模型
+   - 去deepseek官网,获取配置api_key,[deepseek获取api_key](https://platform.deepseek.com/api_keys),当然也可以配置openai、qwen、gemini、01yi、MiniMax 等其他模型
+   - 使用 **MiniMax**:在 `config/config.yaml` 中将 `selected_module.LLM` 设为 `MiniMaxLLM`,并在 `LLM.MiniMaxLLM` 下填入 `api_key`(默认模型 `MiniMax-M3`,Base URL `https://api.minimax.io/v1`)。MiniMax 也提供云端 TTS:将 `selected_module.TTS` 设为 `MiniMaxTTS` 并填入同一 `api_key` 即可(默认模型 `speech-2.8-hd`)。
    - 如果需要使用通用AIGC配置(测试中),不可用的话,可以使用tag 分支 v0.0.1 v0.0.2
    - /third_party/OpenManus/config/config.toml 需要配置里面的 model、base_url、api_key
    - 为支持openclaw,需要修改config/.env,配置openclaw Auth权限
diff --git a/README_en.md b/README_en.md
index 18887bd..2dd980c 100644
--- a/README_en.md
+++ b/README_en.md
@@ -87,7 +87,8 @@ Make sure you have the following tools and libraries installed in your developme
 
    - Open `config/config.yaml` to configure ASR, LLM, etc.
    - Download `SenseVoiceSmall` to the `models/SenseVoiceSmall` directory [SenseVoiceSmall Download](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main)
-   - Get an API key from the DeepSeek website: [DeepSeek API Key](https://platform.deepseek.com/api_keys), or you can configure other models such as OpenAI, Qwen, Gemini, 01yi.
+   - Get an API key from the DeepSeek website: [DeepSeek API Key](https://platform.deepseek.com/api_keys), or you can configure other models such as OpenAI, Qwen, Gemini, 01yi, MiniMax.
+   - To use **MiniMax**: set `selected_module.LLM` to `MiniMaxLLM` in `config/config.yaml` and fill in `api_key` under `LLM.MiniMaxLLM` (default model `MiniMax-M3`, base URL `https://api.minimax.io/v1`). MiniMax also provides cloud TTS: set `selected_module.TTS` to `MiniMaxTTS` and reuse the same `api_key` (default model `speech-2.8-hd`).
 
 4. Run the project:
 
diff --git a/bailing/llm.py b/bailing/llm.py
index cd04f17..dcffd02 100644
--- a/bailing/llm.py
+++ b/bailing/llm.py
@@ -98,6 +98,27 @@ def response_call(self, dialogue, tools):
             logger.error(f"OllamaLLM tool-call error: {e}")
 
 
+class MiniMaxLLM(OpenAILLM):
+    """MiniMax LLM provider.
+
+    MiniMax exposes an OpenAI-compatible Chat Completions API, so we simply
+    reuse OpenAILLM and provide sensible defaults for the base URL and model.
+    Docs: https://platform.minimax.io/docs/api-reference/text-openai-api
+    """
+
+    DEFAULT_BASE_URL = "https://api.minimax.io/v1"
+    DEFAULT_MODEL = "MiniMax-M3"
+
+    def __init__(self, config):
+        # Work on a copy so the caller's config mapping is not mutated.
+        config = dict(config)
+        if not config.get("url"):
+            config["url"] = self.DEFAULT_BASE_URL
+        if not config.get("model_name"):
+            config["model_name"] = self.DEFAULT_MODEL
+        super().__init__(config)
+
+
 def create_instance(class_name, *args, **kwargs):
     # 获取类对象
     cls = globals().get(class_name)
diff --git a/bailing/tts.py b/bailing/tts.py
index e63ce6c..2b087d3 100644
--- a/bailing/tts.py
+++ b/bailing/tts.py
@@ -8,6 +8,7 @@ from datetime import datetime
 
 from gtts import gTTS
 import edge_tts
+import requests
 import ChatTTS
 import torch
 import torchaudio
@@ -295,6 +296,115 @@ def to_tts(self, text: str) -> str:
             return ""
 
 
+class MiniMaxTTS(AbstractTTS):
+    """MiniMax Text-to-Speech provider (T2A v2 HTTP API).
+
+    Converts text to speech via MiniMax's cloud TTS service and writes the
+    resulting audio to a local file. The API returns hex-encoded audio (not
+    base64). Docs: https://platform.minimax.io/docs/api-reference/speech-t2a-http
+
+    config keys:
+    - api_key: MiniMax API key (shared with the Chat API)
+    - voice: voice id, e.g. 'English_Graceful_Lady' (default)
+    - model: TTS model id, e.g. 'speech-2.8-hd' (default)
+    - base_url: API base url, default 'https://api.minimax.io'
+    - output_file: directory to write audio files to
+    - format: audio format, default 'mp3'
+    - sample_rate / bitrate / speed / vol / pitch: optional fine tuning
+    - timeout: HTTP request timeout in seconds, default 30
+    """
+
+    def __init__(self, config):
+        # Keys left blank in config.yaml load as None, so use `or` for the
+        # string settings instead of relying on the dict.get() default alone.
+        self.output_file = config.get("output_file") or "tmp/"
+        self.api_key = config.get("api_key")
+        self.voice = config.get("voice") or "English_Graceful_Lady"
+        self.model = config.get("model") or "speech-2.8-hd"
+        self.base_url = (config.get("base_url") or "https://api.minimax.io").rstrip("/")
+        self.format = config.get("format") or "mp3"
+        self.sample_rate = config.get("sample_rate", 32000)
+        self.bitrate = config.get("bitrate", 128000)
+        self.speed = config.get("speed", 1)
+        self.vol = config.get("vol", 1)
+        self.pitch = config.get("pitch", 0)
+        self.timeout = config.get("timeout", 30)
+
+    def _generate_filename(self, extension=".mp3"):
+        """Return a unique, date-stamped path inside the output directory."""
+        return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")
+
+    def _log_execution_time(self, start_time):
+        """Log the seconds elapsed since *start_time* at debug level."""
+        end_time = time.time()
+        execution_time = end_time - start_time
+        logger.debug(f"Execution Time: {execution_time:.2f} seconds")
+
+    def to_tts(self, text):
+        """Synthesize *text* to an audio file.
+
+        Returns the path of the written file, or None when the request,
+        the API status check, the hex decoding or the file write fails.
+        """
+        if not text:
+            logger.error("MiniMax TTS received empty text")
+            return None
+        if not self.api_key:
+            logger.error("MiniMax TTS api_key is not configured")
+            return None
+        tmpfile = self._generate_filename(f".{self.format}")
+        start_time = time.time()
+        payload = {
+            "model": self.model,
+            "text": text,
+            "stream": False,
+            "voice_setting": {
+                "voice_id": self.voice,
+                "speed": self.speed,
+                "vol": self.vol,
+                "pitch": self.pitch,
+            },
+            "audio_setting": {
+                "sample_rate": self.sample_rate,
+                "bitrate": self.bitrate,
+                "format": self.format,
+                "channel": 1,
+            },
+        }
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+        }
+        try:
+            response = requests.post(
+                f"{self.base_url}/v1/t2a_v2", headers=headers, json=payload, timeout=self.timeout
+            )
+            response.raise_for_status()
+            result = response.json()
+            # `or {}` also covers keys that are present but null.
+            base_resp = result.get("base_resp") or {}
+            if base_resp.get("status_code") != 0:
+                logger.error(f"MiniMax TTS failed: {base_resp.get('status_msg')}")
+                return None
+            audio_hex = (result.get("data") or {}).get("audio")
+            if not audio_hex:
+                logger.error("MiniMax TTS returned empty audio")
+                return None
+            # Decode before opening the file so a malformed payload does not
+            # leave an empty file behind. MiniMax returns hex, not base64.
+            audio_bytes = bytes.fromhex(audio_hex)
+            os.makedirs(self.output_file, exist_ok=True)
+            with open(tmpfile, "wb") as f:
+                f.write(audio_bytes)
+            self._log_execution_time(start_time)
+            return tmpfile
+        except Exception as e:
+            # Provider boundary: report the failure and signal it with None
+            # rather than letting it propagate to the caller.
+            logger.error(f"Failed to generate TTS file: {e}")
+            return None
+
+
 def create_instance(class_name, *args, **kwargs):
     # 获取类对象
     cls = globals().get(class_name)
diff --git a/config/config.yaml b/config/config.yaml
index fc0f232..32ae8fa 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -42,6 +42,10 @@ LLM:
     model_name: deepseek-chat
     url: https://api.deepseek.com
     api_key:
+  MiniMaxLLM:
+    model_name: MiniMax-M3
+    url: https://api.minimax.io/v1
+    api_key:
   OllamaLLM:
     model_name: qwen-chat
     url: http://localhost:11434/api/chat # 注意后缀是api/chat
@@ -67,6 +71,12 @@ TTS:
     lang: z
     voice: zf_001
     repo_id: hexgrad/Kokoro-82M-v1.1-zh
+  MiniMaxTTS:
+    output_file: tmp/
+    api_key:
+    voice: English_Graceful_Lady
+    model: speech-2.8-hd
+    base_url: https://api.minimax.io
 
 Player:
   PygameSoundPlayer: null