datascale-ai · AeBoPi · May 18, 2026 · May 18, 2026
diff --git a/backend/api/routes/voice.py b/backend/api/routes/voice.py
@@ -396,20 +396,31 @@ async def device_voice_ws(
 ):
     mac = validate_mac_param(mac)
     x_device_token = websocket.headers.get("x-device-token") or websocket.query_params.get("token")
+
+    # Accept immediately so the client (ESP32) gets HTTP 101 without waiting for DB queries.
+    # Auth and session setup happen right after; if they fail we close with an error code.
+    await websocket.accept()
+    logger.info("[VOICE_WS] accepted mac=%s has_token=%s", mac, bool(x_device_token))
+
     try:
         await require_device_token(mac, x_device_token, websocket.headers.get("accept-language"))
     except HTTPException as exc:
+        logger.warning("[VOICE_WS] auth failed mac=%s status=%d detail=%s", mac, exc.status_code, exc.detail)
         await websocket.close(code=1008, reason=str(exc.detail)[:120])
         return
 
-    settings = await _resolve_device_voice_runtime_settings(mac)
-    session = create_voice_ws_session(
-        settings=settings,
-        access_user_id=None,
-        access_mac=mac,
-    )
-    await websocket.accept()
-    logger.info("[VOICE_WS] accepted mac=%s session_id=%s", mac, session.session_id)
+    try:
+        settings = await _resolve_device_voice_runtime_settings(mac)
+        session = create_voice_ws_session(
+            settings=settings,
+            access_user_id=None,
+            access_mac=mac,
+        )
+        logger.info("[VOICE_WS] session ready mac=%s session_id=%s provider=%s", mac, getattr(session, "session_id", "?"), getattr(settings, "llm_provider", "?"))
+    except Exception:
+        logger.exception("[VOICE_WS] session setup failed mac=%s", mac)
+        await websocket.close(code=1011, reason="internal error")
+        return
 
     async def _sender() -> None:
         async for event in iter_voice_ws_events(session):
@@ -430,6 +441,7 @@ async def _sender() -> None:
                 ws_message.get("type"),
             )
             if ws_message.get("type") == "websocket.disconnect":
+                logger.info("[VOICE_WS] client disconnected mac=%s code=%s", mac, ws_message.get("code"))
                 break
 
             if "bytes" in ws_message and ws_message["bytes"]:

diff --git a/docs/en/voice-mode.md b/docs/en/voice-mode.md
diff --git a/docs/voice-mode.md b/docs/voice-mode.md
diff --git a/firmware/src/network.cpp b/firmware/src/network.cpp
@@ -1431,9 +1431,8 @@ bool voiceWsOpen(int sampleRate, int screenW, int screenH, bool includeImage) {
     }
 
     String mac = WiFi.macAddress();
-    String path = basePath + "/api/device/" + mac + "/voice/ws";
+    String path = basePath + "/api/device/" + mac + "/voice/ws?token=" + cfgDeviceToken;
     String extraHeaders = String("X-Device-Token: ") + cfgDeviceToken;
-
     String startMsg = String("{\"type\":\"session.start\",\"sample_rate\":") + sampleRate
                     + ",\"w\":" + screenW
                     + ",\"h\":" + screenH
Mode	Description	Use Case
Button Interaction	The user presses the device button to speak; when the button is released, the device sends the audio to the backend, waits for the AI response, and plays it back	Physical device with microphone and speaker
HTTP Request	Upload PCM audio via HTTP POST to get recognized text, response text, response audio, and conversation image	Limited device capability or web integration
Module	Role	Interface	Description
INMP441	Microphone (I2S input)	I2S	Omnidirectional MEMS microphone, 16kHz sampling, I2S interface
MAX98357A	Speaker amplifier (I2S output)	I2S	Class D amplifier module, directly drives 3W 8Ω speaker
INMP441 Pin	ESP32-WROOM-32E GPIO	Description
VCC	3.3V	Power supply
GND	GND	Ground
SCK	GPIO 18	I2S clock
WS	GPIO 19	I2S word select
SD	GPIO 33	I2S data output
MAX98357A Pin	ESP32-WROOM-32E GPIO	Description
VCC	3.3V	Power supply (MAX98357A supports 3.3V-5V)
GND	Do not connect	Connecting will cause no sound detection
BCK (DIN/BCLK)	GPIO 17	I2S bit clock
LCK (WS/FS)	GPIO 16	I2S word select
DIN (DOUT)	GPIO 22	I2S data input
Function	ESP32-WROOM-32E GPIO	Description
One side of button	GPIO 23	Long press 3 seconds to enter AI Chat Mode
Other side of button	GND	Short press to enter network pairing mode
Macro	Default Value	Description
`SAMPLE_RATE`	16000	Sample rate (Hz), must be 16kHz
`ENABLE_OPUS`	Optional	Enable Opus codec (saves bandwidth, requires backend support)
`AI_CHAT_BTN_HOLD_MS`	3000	Duration for long press to trigger AI chat (milliseconds)
Parameter	Value	Description
Sample rate	16kHz	Used by both STT and TTS
Microphone bit depth	32-bit I2S	INMP441 outputs 24-bit samples; the firmware keeps the upper 16 bits
Speaker bit depth	16-bit	MAX98357A requires 16-bit stereo input
Opus frame length	60ms	When Opus is enabled, each frame is 60ms (960 samples)
I2S DMA buffer	240 samples	Input/output DMA buffer size
Trigger Words	Target Mode	Example
Weather, Weather Dashboard	WEATHER	"Switch to weather mode"
Calendar, Monthly Calendar	CALENDAR	"Switch to calendar mode"
Daily Recommendation, Daily	DAILY	"Switch to daily recommendation mode"
模式	说明	适用场景
按键交互	用户按设备按键说话，松开后设备将音频发送到后端，等待 AI 回复后播放	物理设备有麦克风和扬声器
HTTP 请求	通过 HTTP POST 上传 PCM 音频，获取识别文本、回复文本、回复音频和对话图片	设备能力有限或需要 Web 端集成
模块	角色	接口	说明
INMP441	麦克风（I2S 输入）	I2S	全向 MEMS 麦克风，16kHz 采样，I2S 接口
MAX98357A	扬声器功放（I2S 输出）	I2S	D 类功放模块，直推 3W 8Ω 喇叭