pollen-robotics · FabienDanieau · May 25, 2026 · Mar 27, 2026 · Mar 27, 2026 · Apr 14, 2026
diff --git a/docs/source/API/openapi.json b/docs/source/API/openapi.json
@@ -1235,6 +1235,52 @@
         }
       }
     },
+    "/api/media/wobbling/enable": {
+      "post": {
+        "summary": "Enable Wobbling",
+        "description": "Enable audio-reactive head wobbling.\n\nWhen enabled, audio played on the daemon (sounds, incoming WebRTC\naudio) is analysed and converted into subtle head movements.",
+        "operationId": "enable_wobbling_api_media_wobbling_enable_post",
+        "responses": {
+          "200": {
+            "description": "Successful Response",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "additionalProperties": {
+                    "type": "string"
+                  },
+                  "type": "object",
+                  "title": "Response Enable Wobbling Api Media Wobbling Enable Post"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/api/media/wobbling/disable": {
+      "post": {
+        "summary": "Disable Wobbling",
+        "description": "Disable audio-reactive head wobbling and reset offsets.",
+        "operationId": "disable_wobbling_api_media_wobbling_disable_post",
+        "responses": {
+          "200": {
+            "description": "Successful Response",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "additionalProperties": {
+                    "type": "string"
+                  },
+                  "type": "object",
+                  "title": "Response Disable Wobbling Api Media Wobbling Disable Post"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
     "/api/media/sounds/upload": {
       "post": {
         "summary": "Upload Sound",

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
@@ -106,6 +106,8 @@
       title: Sound Playback
     - local: examples/sound_record
       title: Sound Recording
+    - local: examples/sound_tts
+      title: Sound TTS (with head wobbling)
     - local: examples/custom_media_manager
       title: Custom Media Manager
   title: Examples

diff --git a/docs/source/examples/sound_tts.md b/docs/source/examples/sound_tts.md
@@ -0,0 +1,46 @@
+# Sound TTS (with head wobbling)
+
+This example synthesizes speech from text via ResembleAI's
+[Chatterbox Multilingual TTS](https://huggingface.co/spaces/ResembleAI/Chatterbox-Multilingual-TTS)
+Hugging Face Space, plays the returned audio on Reachy Mini, and
+wobbles the head in sync with the speech.
+
+Chatterbox supports zero-shot voice cloning: pass a short reference
+audio file and the synthesis matches that voice. 23 languages are
+supported.
+
+**Usage:**
+
+```bash
+# Default English voice
+uv run python examples/sound_tts.py --text "Hello, I can wobble my head!"
+
+# Different language
+uv run python examples/sound_tts.py --text "Bonjour, je suis Reachy Mini" --lang fr
+
+# Clone a voice from a local sample
+uv run python examples/sound_tts.py \
+    --text "Hello world" \
+    --ref-audio ~/Downloads/my_voice.wav
+```
+
+**Options:**
+
+- `--text <str>`: Text to synthesize (max 300 chars per request).
+- `--lang <code>`: ISO 639-1 language code. Supported: `ar`, `da`,
+  `de`, `el`, `en`, `es`, `fi`, `fr`, `he`, `hi`, `it`, `ja`, `ko`,
+  `ms`, `nl`, `no`, `pl`, `pt`, `ru`, `sv`, `sw`, `tr`, `zh`.
+- `--ref-audio <path|url>`: Reference audio for zero-shot voice
+  cloning. Local paths and URLs both work; defaults to a Gradio
+  sample voice.
+
+Synthesis runs on the Space's shared GPU and typically takes
+60–90 s per sentence.
+
+<literalinclude>
+{"path": "../../../examples/sound_tts.py",
+"language": "python",
+"start-after": "START doc_example",
+"end-before": "END doc_example"
+}
+</literalinclude>
diff --git a/examples/sound_play.py b/examples/sound_play.py
@@ -51,9 +51,13 @@ def play_live_tone(mini: "ReachyMini", tone_hz: float) -> None:
         mini.media.stop_playing()
 
 
-def main(backend: str, wav_path: str | None, tone_hz: float) -> None:
+def main(
+    backend: str, wav_path: str | None, tone_hz: float, wobbling: bool = False
+) -> None:
     """Run the sound playback example."""
     with ReachyMini(log_level="DEBUG", media_backend=backend) as mini:
+        if wobbling:
+            mini.enable_wobbling()
         if wav_path:
             play_wav(mini, wav_path)
         else:
@@ -90,8 +94,18 @@ def main(backend: str, wav_path: str | None, tone_hz: float) -> None:
         type=float,
         help="Sine wave frequency in Hz (--live mode only).",
     )
+    parser.add_argument(
+        "--wobbling",
+        action="store_true",
+        help="Enable audio-reactive head wobbling.",
+    )
 
     args = parser.parse_args()
-    main(backend=args.backend, wav_path=args.wav, tone_hz=args.tone_hz)
+    main(
+        backend=args.backend,
+        wav_path=args.wav,
+        tone_hz=args.tone_hz,
+        wobbling=args.wobbling,
+    )
 
 # END doc_example
diff --git a/examples/sound_tts.py b/examples/sound_tts.py
@@ -0,0 +1,120 @@
+"""TTS demo with head wobbling.
+
+Sends text to ResembleAI's Chatterbox Multilingual TTS Hugging Face
+Space (zero-shot voice cloning, 23 languages), plays the returned
+audio on Reachy Mini, and wobbles the head in sync.
+
+Usage::
+
+    uv run python examples/sound_tts.py --text "Hello world"
+    uv run python examples/sound_tts.py --text "Bonjour" --lang fr
+    uv run python examples/sound_tts.py --text "..." --ref-audio /path/to/voice.wav
+
+Browse the Space: https://huggingface.co/spaces/ResembleAI/Chatterbox-Multilingual-TTS
+"""
+
+# START doc_example
+
+import argparse
+import os
+import time
+
+import gi
+from gradio_client import Client, handle_file
+
+gi.require_version("Gst", "1.0")
+gi.require_version("GstPbutils", "1.0")
+from gi.repository import Gst, GstPbutils  # noqa: E402
+
+from reachy_mini import ReachyMini  # noqa: E402
+
+HF_SPACE = "ResembleAI/Chatterbox-Multilingual-TTS"
+LANGUAGES = [
+    "ar", "da", "de", "el", "en", "es", "fi", "fr", "he", "hi", "it",
+    "ja", "ko", "ms", "nl", "no", "pl", "pt", "ru", "sv", "sw", "tr", "zh",
+]
+DEFAULT_REF_AUDIO = (
+    "https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav"
+)
+
+
+def synthesize(text: str, lang: str, ref_audio: str) -> str:
+    """Submit *text* to Chatterbox; return a path to a local audio file.
+
+    *ref_audio* is the voice-cloning prompt: an http(s) URL is passed
+    through unchanged, anything else is treated as a local path and has
+    a leading ``~`` expanded.
+    """
+    if not ref_audio.startswith(("http://", "https://")):
+        ref_audio = os.path.expanduser(ref_audio)
+    client = Client(HF_SPACE)
+    audio_path = client.predict(
+        text_input=text,
+        language_id=lang,
+        audio_prompt_path_input=handle_file(ref_audio),
+        api_name="/generate_tts_audio",
+    )
+    return str(audio_path)
+
+
+def probe_duration_s(path: str) -> float:
+    """Return the media duration of *path* in seconds via GStreamer."""
+    Gst.init([])
+    disc = GstPbutils.Discoverer.new(10 * Gst.SECOND)  # discovery timeout
+    # Let GStreamer build the URI: unlike a hand-written "file://" prefix it
+    # escapes special characters and resolves relative paths.
+    info = disc.discover_uri(Gst.filename_to_uri(path))
+    return float(info.get_duration() / Gst.SECOND)
+
+
+def main(text: str, lang: str, ref_audio: str) -> None:
+    """Synthesize *text*, play it on Reachy Mini with wobbling enabled."""
+    print(f"Synthesizing {len(text)} chars ({lang}) with Chatterbox...")
+    audio_path = synthesize(text, lang, ref_audio)
+    duration = probe_duration_s(audio_path)
+    print(f"Got {audio_path} ({duration:.1f}s)")
+
+    with ReachyMini(log_level="INFO") as mini:
+        mini.enable_wobbling()
+        try:
+            mini.media.play_sound(audio_path)
+            # Wait out the clip (plus a small margin) before leaving the
+            # session; presumably play_sound() returns before playback ends.
+            time.sleep(duration + 0.5)
+        finally:
+            # Always switch wobbling back off, even on Ctrl+C or a playback error.
+            mini.disable_wobbling()
+    print("Done.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Chatterbox Multilingual TTS + head wobbler demo.",
+    )
+    parser.add_argument(
+        "--text",
+        type=str,
+        default="Hello, I am Reachy Mini. Let me wobble my head while I speak.",
+        help="Text to synthesize (max 300 chars per request).",
+    )
+    parser.add_argument(
+        "--lang",
+        type=str,
+        default="en",
+        choices=LANGUAGES,
+        help="Language code (ISO 639-1).",
+    )
+    parser.add_argument(
+        "--ref-audio",
+        type=str,
+        default=DEFAULT_REF_AUDIO,
+        help="Reference audio (URL or local path) for zero-shot voice cloning.",
+    )
+    args = parser.parse_args()
+    main(
+        text=args.text,
+        lang=args.lang,
+        ref_audio=args.ref_audio,
+    )
+
+# END doc_example
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,6 +53,7 @@ examples = [
     "soundfile",
     "opencv-python<=5.0",
     "cv2_enumerate_cameras>=1.2.1",
+    "gradio-client"
 ]
 mujoco = ["mujoco==3.3.0"]
 nn_kinematics = ["onnxruntime==1.22.1"]
@@ -73,7 +74,7 @@ all = [
     "reachy_mini[placo_kinematics]",
     "reachy_mini[rerun]",
     "reachy_mini[wireless-version]",
-    "reachy_mini[opencv]",
+    "reachy_mini[opencv]"
 ]
 
 [dependency-groups]

diff --git a/src/reachy_mini/daemon/app/routers/media.py b/src/reachy_mini/daemon/app/routers/media.py
@@ -95,6 +95,39 @@ async def stop_sound(
     return {"status": "ok"}
 
 
+@router.post("/wobbling/enable")
+async def enable_wobbling(
+    daemon: Daemon = Depends(get_daemon),
+) -> dict[str, str]:
+    """Enable audio-reactive head wobbling.
+
+    When enabled, audio played on the daemon (sounds, incoming WebRTC
+    audio) is analysed and converted into subtle head movements.
+    """
+    active = daemon.backend
+    if not (active is not None and active.ready.is_set()):
+        raise HTTPException(status_code=503, detail="Backend not running")
+    media_server = active._media_server
+    if media_server is not None:  # absent media server: succeed as a no-op
+        media_server.enable_wobbling(active.set_speech_offsets)
+    return {"status": "ok"}
+
+
+@router.post("/wobbling/disable")
+async def disable_wobbling(
+    daemon: Daemon = Depends(get_daemon),
+) -> dict[str, str]:
+    """Disable audio-reactive head wobbling and reset offsets."""
+    active = daemon.backend
+    if not (active is not None and active.ready.is_set()):
+        raise HTTPException(status_code=503, detail="Backend not running")
+    media_server = active._media_server
+    if media_server is not None:
+        media_server.disable_wobbling()
+    # Offsets are zeroed below whether or not a media server exists.
+    active.set_speech_offsets((0.0, 0.0, 0.0, 0.0, 0.0, 0.0))
+    return {"status": "ok"}
+
+
 @router.post("/sounds/upload")
 async def upload_sound(
     file: UploadFile = File(...),