aliyahmcrae · YakimaProgrammer · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/raspberry-pi/config.toml b/raspberry-pi/config.toml
@@ -1,4 +1,6 @@
 [audio]
+# Native sample rate of the microphone in Hz; audio is captured at this rate and resampled to SAMPLE_RATE
+MIC_RATE = 48000
 # How many samples per second to collect
 SAMPLE_RATE = 16000
 MODEL_PATH = "cache/moonshine/tiny-streaming"
@@ -8,4 +10,4 @@ MODEL_ARCH = 2 #tiny streaming
 # [intent]); the Pi just streams transcript lines.
 
 [remote]
-url = "ws://localhost:8765/cse481/ws/pi"
+url = "wss://api.magnusfulton.com/cse481/ws/pi"
diff --git a/raspberry-pi/main.py b/raspberry-pi/main.py
@@ -1,9 +1,9 @@
 import tomllib
 import time
 import asyncio
-import websockets
 import json
 import numpy as np
+import sounddevice as sd
 from moonshine_voice import Transcriber, TranscriptEventListener, ModelArch
 from typing import Union, Awaitable, Callable
 
@@ -50,64 +50,50 @@ async def worker():
     with open("config.toml", "rb") as t:
         conf = tomllib.load(t)
 
-    print(f"[worker] connecting to {conf['remote']['url']}")
-    async with asyncio.TaskGroup() as tg, \
-            websockets.connect(
-                conf["remote"]["url"],
-                max_size=None,
-                ping_interval=20) as ws:
-
-        print("[worker] websocket connected")
-
-        async def send_transcript(text):
-            try:
-                print(f"[worker] send_transcript: sending transcript={text!r}")
-                await ws.send(json.dumps({
-                    "type": "transcript",
-                    "data": text
-                }))
-                print("[worker] send_transcript: send completed")
-            except websockets.exceptions.ConnectionClosedOK:
-                # remote closed cleanly; ignore this send
-                print(
-                    "[worker] send_transcript: ConnectionClosedOK while sending; ignoring")
-                return
-            except Exception as exc:
-                # connection closed or other send error; ignore so pipeline can continue/shutdown gracefully
-                print(f"[worker] send_transcript: send failed: {exc}")
-                return
+    print("[worker] starting local microphone capture")
+
+    async def print_transcript(text):  # transcript callback handed to AudioPipeline below; only logs to stdout
+        print(f"[worker] transcript: {text!r}")
+
+    audio_pipeline = AudioPipeline(conf["audio"], print_transcript)
 
-        # The cloud now handles intent detection, so the Pi just streams each
-        # completed transcript line as the model produces it.
-        audio_pipeline = AudioPipeline(conf["audio"], send_transcript)
-
-        try:
-            print("[worker] sending register_pi")
-            await ws.send(json.dumps({"type": "register_pi"}))
-            print("[worker] register_pi sent")
-        except websockets.exceptions.ConnectionClosedOK:
-            print("[worker] connection closed during register; exiting")
-            return
-        except Exception as e:
-            print("[worker] failed to send register:", e)
-            return
-
-        print("[worker] created audio_pipeline")
-
-        print("Started!")
-
-        async for msg in ws:
-            if isinstance(msg, (bytes, bytearray)):
-                samples_i16 = np.frombuffer(msg, dtype=np.int16)
-                samples_f32 = samples_i16.astype(np.float32) / 32768.0
-                audio_pipeline.submit_audio_sample(samples_f32)
-            else:
-                try:
-                    data = json.loads(msg)
-                    print(f"[worker] text frame: {data}")
-                except Exception as e:
-                    print(f"[worker] non-json frame: {msg!r} error={e}")
+    mic_rate = conf["audio"]["MIC_RATE"]
+    sample_rate = conf["audio"]["SAMPLE_RATE"]
+    channels = conf["audio"].get("CHANNELS", 1)
+
+    def audio_callback(indata, frames, time_info, status):  # InputStream callback: downmix, resample, hand off
+        if status:
+            print(f"[audio] status: {status}")
+        samples = indata
+        if samples.ndim > 1:
+            samples = samples.mean(axis=1)  # average over axis 1 (presumably channels) -> mono; TODO confirm layout
+
+        # Linear-interpolation resample when the mic rate differs from the model's SAMPLE_RATE
+        if mic_rate != sample_rate:
+            old_len = samples.shape[0]
+            duration = old_len / mic_rate  # length of this block in seconds
+            new_len = int(round(duration * sample_rate))  # sample count at the target rate
+            if new_len <= 0:
+                return  # block too short to yield any output sample; drop it
+            t_old = np.linspace(0, duration, num=old_len, endpoint=False)  # timestamps of captured samples
+            t_new = np.linspace(0, duration, num=new_len, endpoint=False)  # timestamps of output samples
+            samples = np.interp(t_new, t_old, samples).astype(np.float32)  # no low-pass filter is applied first
+        else:
+            samples = samples.astype(np.float32)  # rates already match: only convert dtype
+        # NOTE(review): presumably runs on sounddevice's audio thread -- confirm submit_audio_sample is safe there
+        audio_pipeline.submit_audio_sample(samples)
+
+    print(f"[worker] opening InputStream mic_rate={mic_rate} sample_rate={sample_rate}")
+    try:
+        with sd.InputStream(samplerate=mic_rate, channels=channels, callback=audio_callback):
+            print("Started! Press Ctrl-C to stop.")
+            while True:
+                await asyncio.sleep(1)
+    except KeyboardInterrupt:
+        print("Interrupted, exiting")
+    except Exception as e:
+        print(f"[worker] audio stream error: {e}")
 
 
 if __name__ == "__main__":
-    asyncio.run(worker())
+    asyncio.run(worker())
diff --git a/vox-tiny/.gitignore b/vox-tiny/.gitignore
@@ -0,0 +1,3 @@
+vosk-model-small-en-us-0.15/
+secrets.json
+intents/
diff --git a/vox-tiny/assistant.py b/vox-tiny/assistant.py
@@ -0,0 +1,181 @@
+# pip install sounddevice vosk scikit-learn   (playback uses the external `aplay` command, not python-vlc/PyAudio)
+
+import json
+import queue
+import pickle
+
+import sounddevice as sd
+import subprocess
+
+from vosk import Model
+from vosk import KaldiRecognizer
+
+SAMPLE_RATE = 48000  # Hz; used for both the input stream and the recognizer
+BLOCK_SIZE = 4000  # passed as the input stream's blocksize
+
+audio_q = queue.Queue()  # captured audio blocks: filled by audio_callback, drained in main
+
+# --------------------
+# load intent model: a pickled (vectorizer, classifier) pair
+# --------------------
+# NOTE: unpickling can execute arbitrary code, so intent_model.pkl must come from a trusted source.
+with open(
+    "intent_model.pkl",
+    "rb"
+) as f:
+    vectorizer, classifier = pickle.load(f)
+
+# --------------------
+# load intent names
+# --------------------
+
+with open("intent_names.txt") as f:  # one intent name per line; blank lines are skipped
+    intent_names = [x.strip() for x in f if x.strip()]
+    # preserve order and remove duplicates while keeping the first occurrence
+    intent_names = list(dict.fromkeys(intent_names))
+
+# --------------------
+# audio playback
+# --------------------
+
+current_player = None  # NOTE(review): not referenced anywhere else in this file
+
+
+def play_intent(intent):
+    """Play ``intents/<intent>.wav`` via ``aplay``, waiting for it to finish."""
+    wav_path = f"intents/{intent}.wav"
+    # aplay's own console output is discarded.
+    subprocess.run(["aplay", wav_path], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+# --------------------
+# intent classifier
+# --------------------
+
+
+def classify(text):
+    """Predict the intent of a transcript and the model's confidence in it.
+
+    ``text`` is vectorized with the module-level ``vectorizer`` and scored
+    by ``classifier``; the class with the highest probability wins. A
+    winning label that converts to an in-range integer is looked up in
+    ``intent_names``; any other label is returned in its string form.
+
+    Args:
+        text: The recognized utterance to classify.
+
+    Returns:
+        An ``(intent_name, confidence)`` tuple, where ``confidence`` is the
+        winning class probability as a plain ``float``.
+    """
+    features = vectorizer.transform([text])
+    # predict_proba yields one row per input; a single text gives row 0.
+    probs = classifier.predict_proba(features)[0]
+
+    best_label_index = probs.argmax()
+    confidence = float(probs[best_label_index])
+    # Probability columns are ordered like classifier.classes_.
+    best_class = classifier.classes_[best_label_index]
+
+    # Only a failed integer conversion is expected here; anything else
+    # should surface instead of being swallowed.
+    try:
+        label_index = int(best_class)
+    except (TypeError, ValueError, OverflowError):
+        label_index = None
+
+    if label_index is not None and 0 <= label_index < len(intent_names):
+        intent_name = intent_names[label_index]
+    else:
+        # Non-numeric or out-of-range label: the label's string form is
+        # the best name available.
+        intent_name = str(best_class)
+
+    return intent_name, confidence
+
+# --------------------
+# mic callback
+# --------------------
+
+
+def audio_callback(indata, frames, time_info, status):
+    """Queue one captured audio block for the recognizer loop.
+
+    Passed as the ``callback`` of the ``sd.RawInputStream`` opened in
+    ``main``. Any ``status`` flag is printed; ``indata`` is then copied
+    into a ``bytes`` object and put on ``audio_q``. ``frames`` and
+    ``time_info`` are accepted but unused.
+    """
+    if status:
+        print(status)
+
+    audio_q.put(bytes(indata))
+
+# --------------------
+# main
+# --------------------
+
+
+def main():
+    """Transcribe microphone audio and play a clip for each confident intent.
+
+    Loads the Vosk model from ``vosk-model-small-en-us-0.15``, builds a
+    recognizer at ``SAMPLE_RATE``, and opens a mono 16-bit raw input stream
+    whose callback is ``audio_callback``. The loop then repeats these steps:
+
+    1. Block until the next audio chunk arrives on ``audio_q``.
+    2. Feed it to the recognizer and go back to step 1 unless
+       ``AcceptWaveform`` returns a truthy value.
+    3. Parse the recognizer's JSON result; skip it when ``text`` is empty.
+    4. Classify the text and print the transcript, the predicted intent,
+       and its score.
+    5. Play the intent's clip when the score exceeds 0.45.
+
+    The loop has no exit condition, so the function only ends when an
+    exception (for example ``KeyboardInterrupt``) propagates out of it.
+    Each clip is played synchronously, so chunks captured in the meantime
+    wait on ``audio_q`` until playback returns.
+    """
+    # The model directory is resolved relative to the working directory.
+    speech_model = Model("vosk-model-small-en-us-0.15")
+    recognizer = KaldiRecognizer(speech_model, SAMPLE_RATE)
+
+    # Mono 16-bit capture; audio_callback is registered to receive each block.
+    mic_stream = sd.RawInputStream(
+        samplerate=SAMPLE_RATE,
+        blocksize=BLOCK_SIZE,
+        dtype="int16",
+        channels=1,
+        callback=audio_callback,
+    )
+
+    with mic_stream:
+        print("Listening...")
+
+        while True:
+            chunk = audio_q.get()
+
+            # Result() is only read once AcceptWaveform reports truthy.
+            has_result = recognizer.AcceptWaveform(chunk)
+            if not has_result:
+                continue
+
+            result = json.loads(recognizer.Result())
+            text = result.get("text", "")
+
+            if not text:
+                # Nothing was transcribed, so there is nothing to classify.
+                continue
+
+            intent, score = classify(text)
+
+            # Echo the transcript first, then the prediction and its score.
+            print(f"{text!r}")
+            print(f"→ {intent} ({score:.3f})")
+
+            # Low-scoring predictions are printed but not acted on.
+            if score > 0.45:
+                play_intent(intent)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vox-tiny/download-files.sh b/vox-tiny/download-files.sh
@@ -0,0 +1,11 @@
+#! /bin/bash
+# Download and unpack the Vosk model and the spoken-intent WAV clips.
+set -euo pipefail  # stop at the first failed step instead of carrying on
+
+base_url=https://s3.magnusfulton.com/shared/labrador
+rm -rf vosk-model-small-en-us-0.15 intents
+for archive in vosk-model-small-en-us-0.15.zip intents-spoken-wav.zip; do
+    wget -O "$archive" "$base_url/$archive"  # -O overwrites a stale zip instead of saving *.zip.1
+    unzip "$archive"
+    rm "$archive"
+done
diff --git a/vox-tiny/intent_model.pkl b/vox-tiny/intent_model.pkl