diff --git a/backend/api/routes/voice.py b/backend/api/routes/voice.py
index 91593c8..625d3c2 100644
--- a/backend/api/routes/voice.py
+++ b/backend/api/routes/voice.py
@@ -396,20 +396,31 @@ async def device_voice_ws(
 ):
     mac = validate_mac_param(mac)
     x_device_token = websocket.headers.get("x-device-token") or websocket.query_params.get("token")
+
+    # Accept immediately so the client (ESP32) gets HTTP 101 without waiting for DB queries.
+    # Auth and session setup happen right after; if they fail we close with an error code.
+    await websocket.accept()
+    logger.info("[VOICE_WS] accepted mac=%s has_token=%s", mac, bool(x_device_token))
+
     try:
         await require_device_token(mac, x_device_token, websocket.headers.get("accept-language"))
     except HTTPException as exc:
+        logger.warning("[VOICE_WS] auth failed mac=%s status=%d detail=%s", mac, exc.status_code, exc.detail)
         await websocket.close(code=1008, reason=str(exc.detail)[:120])
         return
 
-    settings = await _resolve_device_voice_runtime_settings(mac)
-    session = create_voice_ws_session(
-        settings=settings,
-        access_user_id=None,
-        access_mac=mac,
-    )
-    await websocket.accept()
-    logger.info("[VOICE_WS] accepted mac=%s session_id=%s", mac, session.session_id)
+    try:
+        settings = await _resolve_device_voice_runtime_settings(mac)
+        session = create_voice_ws_session(
+            settings=settings,
+            access_user_id=None,
+            access_mac=mac,
+        )
+        logger.info("[VOICE_WS] session ready mac=%s session_id=%s provider=%s", mac, getattr(session, "session_id", "?"), getattr(settings, "llm_provider", "?"))
+    except Exception:
+        logger.exception("[VOICE_WS] session setup failed mac=%s", mac)
+        await websocket.close(code=1011, reason="internal error")
+        return
 
     async def _sender() -> None:
         async for event in iter_voice_ws_events(session):
@@ -430,6 +441,7 @@ async def _sender() -> None:
                 ws_message.get("type"),
             )
             if ws_message.get("type") == "websocket.disconnect":
+                logger.info("[VOICE_WS] client disconnected mac=%s code=%s", mac, ws_message.get("code"))
                 break
 
             if "bytes" in ws_message and ws_message["bytes"]:
diff --git a/docs/en/voice-mode.md b/docs/en/voice-mode.md
index e5fe448..0b1ace9 100644
Binary files a/docs/en/voice-mode.md and b/docs/en/voice-mode.md differ
diff --git a/docs/voice-mode.md b/docs/voice-mode.md
index c98ab44..4e85b01 100644
Binary files a/docs/voice-mode.md and b/docs/voice-mode.md differ
diff --git a/firmware/src/network.cpp b/firmware/src/network.cpp
index ba6aa6e..b5b7667 100644
--- a/firmware/src/network.cpp
+++ b/firmware/src/network.cpp
@@ -1431,9 +1431,8 @@ bool voiceWsOpen(int sampleRate, int screenW, int screenH, bool includeImage) {
     }
 
     String mac = WiFi.macAddress();
-    String path = basePath + "/api/device/" + mac + "/voice/ws";
+    String path = basePath + "/api/device/" + mac + "/voice/ws?token=" + cfgDeviceToken;
     String extraHeaders = String("X-Device-Token: ") + cfgDeviceToken;
-
     String startMsg = String("{\"type\":\"session.start\",\"sample_rate\":") + sampleRate
                     + ",\"w\":" + screenW
                     + ",\"h\":" + screenH