From 44b34304a8e4d9098c36b96a774053f56d834a52 Mon Sep 17 00:00:00 2001
From: octo-patch <liyuan851277048@icloud.com>
Date: Wed, 24 Jun 2026 22:50:45 +0800
Subject: [PATCH] feat: add MiniMax LLM and TTS provider support

Add MiniMax as a new provider for both chat (LLM) and speech (TTS):

- MiniMaxLLM: subclasses OpenAILLM since MiniMax exposes an
  OpenAI-compatible Chat Completions API; defaults to MiniMax-M3 with
  base URL https://api.minimax.io/v1.
- MiniMaxTTS: calls the MiniMax T2A v2 HTTP API and writes hex-decoded
  audio to a file, matching the AbstractTTS interface.
- Add config entries for both modules in config/config.yaml.
- Document MiniMax setup in README.md and README_en.md.
---
 README.md          |  3 +-
 README_en.md       |  3 +-
 bailing/llm.py     | 20 +++++++++++
 bailing/tts.py     | 86 ++++++++++++++++++++++++++++++++++++++++++++++
 config/config.yaml | 10 ++++++
 5 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6e755d5..75ddcb9 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,8 @@ Robot 负责高效的任务管理与记忆管理，能够智能地处理用户
 
      - 打开config/config.yaml 配置ASR LLM等相关配置
      - 下载SenseVoiceSmall到目录models/SenseVoiceSmall [SenseVoiceSmall下载地址](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main)
-     - 去deepseek官网，获取配置api_key，[deepseek获取api_key](https://platform.deepseek.com/api_keys)，当然也可以配置openai、qwen、gemini、01yi等其他模型
+     - 去deepseek官网，获取配置api_key，[deepseek获取api_key](https://platform.deepseek.com/api_keys)，当然也可以配置openai、qwen、gemini、01yi、MiniMax 等其他模型
+       - 使用 **MiniMax**：在 `config/config.yaml` 中将 `selected_module.LLM` 设为 `MiniMaxLLM`，并在 `LLM.MiniMaxLLM` 下填入 `api_key`（默认模型 `MiniMax-M3`，Base URL `https://api.minimax.io/v1`）。MiniMax 也提供云端 TTS：将 `selected_module.TTS` 设为 `MiniMaxTTS` 并填入同一 `api_key` 即可（默认模型 `speech-2.8-hd`）。
      - 如果需要使用通用AIGC配置（测试中），不可用的话，可以使用tag 分支 v0.0.1 v0.0.2 
        - /third_party/OpenManus/config/config.toml  需要配置里面的 model、base_url、api_key 
      - 为支持openclaw，需要修改config/.env，配置openclaw Auth权限
diff --git a/README_en.md b/README_en.md
index 18887bd..2dd980c 100644
--- a/README_en.md
+++ b/README_en.md
@@ -87,7 +87,8 @@ Make sure you have the following tools and libraries installed in your developme
 
      - Open `config/config.yaml` to configure ASR, LLM, etc.
      - Download `SenseVoiceSmall` to the `models/SenseVoiceSmall` directory [SenseVoiceSmall Download](https://huggingface.co/FunAudioLLM/SenseVoiceSmall/tree/main)
-     - Get an API key from the DeepSeek website: [DeepSeek API Key](https://platform.deepseek.com/api_keys), or you can configure other models such as OpenAI, Qwen, Gemini, 01yi.
+     - Get an API key from the DeepSeek website: [DeepSeek API Key](https://platform.deepseek.com/api_keys), or you can configure other models such as OpenAI, Qwen, Gemini, 01yi, MiniMax.
+       - To use **MiniMax**: set `selected_module.LLM` to `MiniMaxLLM` in `config/config.yaml` and fill in `api_key` under `LLM.MiniMaxLLM` (default model `MiniMax-M3`, base URL `https://api.minimax.io/v1`). MiniMax also provides cloud TTS: set `selected_module.TTS` to `MiniMaxTTS` and reuse the same `api_key` (default model `speech-2.8-hd`).
 
 4. Run the project:
 
diff --git a/bailing/llm.py b/bailing/llm.py
index cd04f17..dcffd02 100644
--- a/bailing/llm.py
+++ b/bailing/llm.py
@@ -98,6 +98,26 @@ def response_call(self, dialogue, tools):
             logger.error(f"OllamaLLM tool-call error: {e}")
 
 
+class MiniMaxLLM(OpenAILLM):
+    """OpenAI-compatible chat provider backed by MiniMax.
+
+    The MiniMax Chat Completions endpoint speaks the OpenAI protocol, so this
+    subclass only overrides construction: it fills in the base URL and model
+    name whenever the supplied config leaves them missing or empty.
+    Docs: https://platform.minimax.io/docs/api-reference/text-openai-api
+    """
+
+    DEFAULT_BASE_URL = "https://api.minimax.io/v1"
+    DEFAULT_MODEL = "MiniMax-M3"
+
+    def __init__(self, config):
+        # Work on a shallow copy so the caller's mapping is not modified here.
+        settings = dict(config)
+        settings["url"] = settings.get("url") or self.DEFAULT_BASE_URL
+        settings["model_name"] = settings.get("model_name") or self.DEFAULT_MODEL
+        super().__init__(settings)
+
+
 def create_instance(class_name, *args, **kwargs):
     # 获取类对象
     cls = globals().get(class_name)
diff --git a/bailing/tts.py b/bailing/tts.py
index e63ce6c..2b087d3 100644
--- a/bailing/tts.py
+++ b/bailing/tts.py
@@ -8,6 +8,7 @@
 from datetime import datetime
 from gtts import gTTS
 import edge_tts
+import requests
 import ChatTTS
 import torch
 import torchaudio
@@ -295,6 +296,91 @@ def to_tts(self, text: str) -> str:
             return ""
 
 
+class MiniMaxTTS(AbstractTTS):
+    """MiniMax Text-to-Speech provider (T2A v2 HTTP API).
+
+    Converts text to speech via MiniMax's cloud TTS service and writes the
+    resulting audio to a local file. The API returns hex-encoded audio (not
+    base64). Docs: https://platform.minimax.io/docs/api-reference/speech-t2a-http
+
+    config keys (a missing key or a null YAML value falls back to its default):
+      - api_key:     MiniMax API key (shared with the Chat API); required
+      - voice:       voice id, default 'English_Graceful_Lady'
+      - model:       TTS model id, default 'speech-2.8-hd'
+      - base_url:    API base url, default 'https://api.minimax.io'
+      - output_file: directory to write audio files to, default 'tmp/'
+      - format:      audio format, default 'mp3'
+      - sample_rate / bitrate / speed / vol / pitch: optional fine tuning
+    """
+
+    def __init__(self, config):
+        config = {k: v for k, v in dict(config).items() if v is not None}  # null -> default
+        self.output_file = config.get("output_file", "tmp/")
+        self.api_key = config.get("api_key")
+        self.voice = config.get("voice", "English_Graceful_Lady")
+        self.model = config.get("model", "speech-2.8-hd")
+        self.base_url = config.get("base_url", "https://api.minimax.io").rstrip("/")
+        self.format = config.get("format", "mp3")
+        self.sample_rate = config.get("sample_rate", 32000)
+        self.bitrate = config.get("bitrate", 128000)
+        self.speed = config.get("speed", 1)
+        self.vol = config.get("vol", 1)
+        self.pitch = config.get("pitch", 0)
+
+    def _generate_filename(self, extension=".mp3"):
+        """Return a unique, date-stamped file path inside the output directory."""
+        return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")
+
+    def _log_execution_time(self, start_time):
+        """Log, at debug level, the seconds elapsed since `start_time`."""
+        logger.debug(f"Execution Time: {time.time() - start_time:.2f} seconds")
+
+    def to_tts(self, text):
+        """Synthesize `text`; return the audio file path, or None on any failure."""
+        if not self.api_key:
+            logger.error("MiniMax TTS failed: api_key is not configured")
+            return None
+        tmpfile = self._generate_filename(f".{self.format}")
+        start_time = time.time()
+        payload = {
+            "model": self.model,
+            "text": text,
+            "stream": False,
+            "voice_setting": {
+                "voice_id": self.voice,
+                "speed": self.speed,
+                "vol": self.vol,
+                "pitch": self.pitch,
+            },
+            "audio_setting": {
+                "sample_rate": self.sample_rate,
+                "bitrate": self.bitrate,
+                "format": self.format,
+                "channel": 1,
+            },
+        }
+        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
+        try:
+            response = requests.post(f"{self.base_url}/v1/t2a_v2", headers=headers, json=payload, timeout=30)
+            response.raise_for_status()
+            result = response.json()
+            base_resp = result.get("base_resp", {})
+            if base_resp.get("status_code") != 0:
+                logger.error(f"MiniMax TTS failed: {base_resp.get('status_msg')}")
+                return None
+            audio_hex = result.get("data", {}).get("audio")
+            if not audio_hex:
+                logger.error("MiniMax TTS returned empty audio")
+                return None
+            with open(tmpfile, "wb") as f:
+                f.write(bytes.fromhex(audio_hex))  # MiniMax returns hex, not base64
+            self._log_execution_time(start_time)
+            return tmpfile
+        except Exception as e:
+            logger.error(f"Failed to generate TTS file: {e}")
+            return None
+
+
 def create_instance(class_name, *args, **kwargs):
     # 获取类对象
     cls = globals().get(class_name)
diff --git a/config/config.yaml b/config/config.yaml
index fc0f232..32ae8fa 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -42,6 +42,10 @@ LLM:
     model_name: deepseek-chat
     url: https://api.deepseek.com
     api_key:
+  MiniMaxLLM:
+    model_name: MiniMax-M3
+    url: https://api.minimax.io/v1
+    api_key:
   OllamaLLM:
     model_name: qwen-chat
     url: http://localhost:11434/api/chat # 注意后缀是api/chat
@@ -67,6 +71,12 @@ TTS:
     lang: z
     voice: zf_001
     repo_id: hexgrad/Kokoro-82M-v1.1-zh
+  MiniMaxTTS:
+    output_file: tmp/
+    api_key:
+    voice: English_Graceful_Lady
+    model: speech-2.8-hd
+    base_url: https://api.minimax.io
 
 Player:
   PygameSoundPlayer: null