Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
73c0cfb
Switched to using Oracle Cloud again
Jun 8, 2026
3057f7f
Updated the raspberry pi config
YakimaProgrammer Jun 8, 2026
fb9a289
feat: switch to local mic capture using sounddevice
YakimaProgrammer Jun 8, 2026
788f75b
refactor: implement asyncio queue-based audio processing for low latency
YakimaProgrammer Jun 8, 2026
11b0565
Revert "refactor: implement asyncio queue-based audio processing for …
YakimaProgrammer Jun 8, 2026
9ca4674
Added some scripts for vosk
YakimaProgrammer Jun 8, 2026
45af64e
[DROP ME] Large model files & build artifacts
YakimaProgrammer Jun 8, 2026
bd21654
Two more imports needed
YakimaProgrammer Jun 9, 2026
a5c06d7
Results: TF-IDF better than embedding model on a speed per accuracy b…
YakimaProgrammer Jun 10, 2026
ad9719b
TF-IDF + regression is also very good
YakimaProgrammer Jun 10, 2026
e3925de
TF-IDF + Regression + Vosk in practice
YakimaProgrammer Jun 10, 2026
841d816
vosk
YakimaProgrammer Jun 10, 2026
15b3657
AI slop
YakimaProgrammer Jun 10, 2026
dc3f04f
ignored secrets.json
YakimaProgrammer Jun 10, 2026
79b66b2
feat: add voice.py to generate short spoken intent mp3s via GPT & gTTS
YakimaProgrammer Jun 10, 2026
862bfa2
moving slop around
YakimaProgrammer Jun 10, 2026
033edf9
voice slop 2
YakimaProgrammer Jun 10, 2026
df69ff2
Revert "voice slop 2"
YakimaProgrammer Jun 10, 2026
da6c9be
File downloader
YakimaProgrammer Jun 10, 2026
2e70ff6
Great, but segfaults
YakimaProgrammer Jun 10, 2026
c618d07
We'll put the buggy code over there, where it can't hurt me!
YakimaProgrammer Jun 10, 2026
846435b
Updated the script to download wav files
YakimaProgrammer Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion raspberry-pi/config.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
[audio]
# The native rate of the microphone
MIC_RATE = 48000
# How many samples per second to collect
SAMPLE_RATE = 16000
MODEL_PATH = "cache/moonshine/tiny-streaming"
Expand All @@ -8,4 +10,4 @@ MODEL_ARCH = 2 #tiny streaming
# [intent]); the Pi just streams transcript lines.

[remote]
url = "ws://localhost:8765/cse481/ws/pi"
url = "wss://api.magnusfulton.com/cse481/ws/pi"
102 changes: 44 additions & 58 deletions raspberry-pi/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import tomllib
import time
import asyncio
import websockets
import json
import numpy as np
import sounddevice as sd
from moonshine_voice import Transcriber, TranscriptEventListener, ModelArch
from typing import Union, Awaitable, Callable

Expand Down Expand Up @@ -50,64 +50,50 @@ async def worker():
with open("config.toml", "rb") as t:
conf = tomllib.load(t)

print(f"[worker] connecting to {conf['remote']['url']}")
async with asyncio.TaskGroup() as tg, \
websockets.connect(
conf["remote"]["url"],
max_size=None,
ping_interval=20) as ws:

print("[worker] websocket connected")

async def send_transcript(text):
try:
print(f"[worker] send_transcript: sending transcript={text!r}")
await ws.send(json.dumps({
"type": "transcript",
"data": text
}))
print("[worker] send_transcript: send completed")
except websockets.exceptions.ConnectionClosedOK:
# remote closed cleanly; ignore this send
print(
"[worker] send_transcript: ConnectionClosedOK while sending; ignoring")
return
except Exception as exc:
# connection closed or other send error; ignore so pipeline can continue/shutdown gracefully
print(f"[worker] send_transcript: send failed: {exc}")
return
print("[worker] starting local microphone capture")

async def print_transcript(text):
print(f"[worker] transcript: {text!r}")

audio_pipeline = AudioPipeline(conf["audio"], print_transcript)

# The cloud now handles intent detection, so the Pi just streams each
# completed transcript line as the model produces it.
audio_pipeline = AudioPipeline(conf["audio"], send_transcript)

try:
print("[worker] sending register_pi")
await ws.send(json.dumps({"type": "register_pi"}))
print("[worker] register_pi sent")
except websockets.exceptions.ConnectionClosedOK:
print("[worker] connection closed during register; exiting")
return
except Exception as e:
print("[worker] failed to send register:", e)
return

print("[worker] created audio_pipeline")

print("Started!")

async for msg in ws:
if isinstance(msg, (bytes, bytearray)):
samples_i16 = np.frombuffer(msg, dtype=np.int16)
samples_f32 = samples_i16.astype(np.float32) / 32768.0
audio_pipeline.submit_audio_sample(samples_f32)
else:
try:
data = json.loads(msg)
print(f"[worker] text frame: {data}")
except Exception as e:
print(f"[worker] non-json frame: {msg!r} error={e}")
mic_rate = conf["audio"]["MIC_RATE"]
sample_rate = conf["audio"]["SAMPLE_RATE"]
channels = conf["audio"].get("CHANNELS", 1)

def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: downmix, resample if needed, and submit audio.

    Reads ``mic_rate``, ``sample_rate`` and ``audio_pipeline`` from the
    enclosing scope. Any non-empty ``status`` is printed. Blocks with more
    than one dimension are averaged across axis 1, and when the microphone
    rate differs from the target rate the block is linearly interpolated
    onto the target rate before being submitted as float32.
    """
    if status:
        print(f"[audio] status: {status}")

    # Collapse (frames, channels) input to a single 1-D signal.
    mono = indata.mean(axis=1) if indata.ndim > 1 else indata

    if mic_rate == sample_rate:
        # Rates already match: only the dtype conversion is required.
        audio_pipeline.submit_audio_sample(mono.astype(np.float32))
        return

    # simple resampling when mic rate differs from model SAMPLE_RATE
    n_in = mono.shape[0]
    span = n_in / mic_rate
    n_out = int(round(span * sample_rate))
    if n_out <= 0:
        # Nothing would remain after resampling; drop this block.
        return

    src_times = np.linspace(0, span, num=n_in, endpoint=False)
    dst_times = np.linspace(0, span, num=n_out, endpoint=False)
    resampled = np.interp(dst_times, src_times, mono).astype(np.float32)

    audio_pipeline.submit_audio_sample(resampled)

print(f"[worker] opening InputStream mic_rate={mic_rate} sample_rate={sample_rate}")
try:
with sd.InputStream(samplerate=mic_rate, channels=channels, callback=audio_callback):
print("Started! Press Ctrl-C to stop.")
while True:
await asyncio.sleep(1)
except KeyboardInterrupt:
print("Interrupted, exiting")
except Exception as e:
print(f"[worker] audio stream error: {e}")


if __name__ == "__main__":
asyncio.run(worker())
asyncio.run(worker())
3 changes: 3 additions & 0 deletions vox-tiny/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
vosk-model-small-en-us-0.15/
secrets.json
intents/
181 changes: 181 additions & 0 deletions vox-tiny/assistant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
# pip install sounddevice python-vlc vosk PyAudio scikit-learn

import json
import queue
import pickle

import sounddevice as sd
import subprocess

from vosk import Model
from vosk import KaldiRecognizer

# Sample rate used for both the microphone input stream and the recognizer.
SAMPLE_RATE = 48000
# Value handed to the input stream as its `blocksize` argument.
BLOCK_SIZE = 4000

# Audio chunks produced by audio_callback() and consumed by main().
audio_q = queue.Queue()

# --------------------
# load model
# --------------------

# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so intent_model.pkl must only ever come from a trusted source.
with open("intent_model.pkl", "rb") as f:
    # The pickle holds a (vectorizer, classifier) pair; classify() uses both.
    vectorizer, classifier = pickle.load(f)

# --------------------
# load intent names
# --------------------

# One intent name per line; blank lines are skipped. The encoding is given
# explicitly so the parsed names do not depend on the platform's locale.
with open("intent_names.txt", encoding="utf-8") as f:
    intent_names = [x.strip() for x in f if x.strip()]
# preserve order and remove duplicates while keeping the first occurrence
intent_names = list(dict.fromkeys(intent_names))

# --------------------
# audio playback
# --------------------

# NOTE(review): not referenced by any code visible in this file — possibly a
# leftover from an earlier playback implementation; confirm before removing.
current_player = None


def play_intent(intent):
    """Play the recorded clip for *intent* with the ``aplay`` command.

    Runs ``aplay intents/<intent>.wav`` and waits for it to finish. The
    player's stdout and stderr are discarded and its exit status is not
    checked.
    """
    command = ["aplay", f"intents/{intent}.wav"]
    silenced = {
        "stdout": subprocess.DEVNULL,
        "stderr": subprocess.DEVNULL,
    }
    subprocess.run(command, **silenced)
# --------------------
# intent classifier
# --------------------


def classify(text):
    """Classify *text* and return the most probable intent with its score.

    The text is vectorized with the module-level ``vectorizer`` and scored
    with the module-level ``classifier``; the class with the highest
    probability wins.

    Args:
        text: The transcript to classify.

    Returns:
        A ``(intent_name, confidence)`` tuple. ``confidence`` is the winning
        probability as a ``float``. ``intent_name`` is looked up in
        ``intent_names`` when the winning class converts to an in-range
        integer index; otherwise it is the class value converted to ``str``.
    """
    features = vectorizer.transform([text])
    probs = classifier.predict_proba(features)[0]

    best_label_index = probs.argmax()
    confidence = float(probs[best_label_index])
    best_class = classifier.classes_[best_label_index]

    # The classifier's labels may be integer indices into intent_names or
    # the intent names themselves; only the former survive int().
    try:
        label_index = int(best_class)
    except (TypeError, ValueError, OverflowError):
        label_index = None

    if label_index is not None and 0 <= label_index < len(intent_names):
        intent_name = intent_names[label_index]
    else:
        # Not a usable index: fall back to the label's own string form.
        intent_name = str(best_class)

    return intent_name, confidence

# --------------------
# mic callback
# --------------------


def audio_callback(indata, frames, time_info, status):
    """Stream callback: report any status flag, then queue the audio block.

    The incoming buffer is copied into a ``bytes`` object before it is put
    on ``audio_q`` for the main loop to consume.
    """
    if status:
        print(status)

    chunk = bytes(indata)
    audio_q.put(chunk)

# --------------------
# main
# --------------------


def main():
    """Run the listen / transcribe / classify / play loop until interrupted.

    Opens a raw 16-bit mono input stream whose callback fills ``audio_q``,
    feeds each queued chunk to the recognizer, and whenever
    ``AcceptWaveform`` returns a truthy value reads the result's ``text``
    field. Non-empty text is classified and printed, and the matching clip
    is played when the score exceeds 0.45.
    """
    speech_model = Model("vosk-model-small-en-us-0.15")
    recognizer = KaldiRecognizer(speech_model, SAMPLE_RATE)

    mic_stream = sd.RawInputStream(
        samplerate=SAMPLE_RATE,
        blocksize=BLOCK_SIZE,
        dtype="int16",
        channels=1,
        callback=audio_callback,
    )

    with mic_stream:
        print("Listening...")

        while True:
            data = audio_q.get()

            # Keep feeding audio until the recognizer signals a result.
            if not recognizer.AcceptWaveform(data):
                continue

            text = json.loads(recognizer.Result()).get("text", "")
            if not text:
                continue

            intent, score = classify(text)

            print(f"{text!r}")
            print(f"→ {intent} ({score:.3f})")

            # Only act on sufficiently confident classifications.
            if score > 0.45:
                play_intent(intent)


if __name__ == "__main__":
main()
11 changes: 11 additions & 0 deletions vox-tiny/download-files.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#! /bin/bash
#
# Download and unpack the Vosk speech model and the spoken-intent WAV clips
# into the current directory, replacing any previously unpacked copies.

# Stop at the first failing command so a failed download is never followed by
# an unzip/rm of a missing or partial archive.
set -euo pipefail

rm -rf vosk-model-small-en-us-0.15 intents

# -O overwrites a stale archive left behind by an interrupted run; without it
# wget would save to "<name>.1" and the stale file would be unpacked instead.
wget -O vosk-model-small-en-us-0.15.zip https://s3.magnusfulton.com/shared/labrador/vosk-model-small-en-us-0.15.zip
unzip vosk-model-small-en-us-0.15.zip
rm vosk-model-small-en-us-0.15.zip

wget -O intents-spoken-wav.zip https://s3.magnusfulton.com/shared/labrador/intents-spoken-wav.zip
unzip intents-spoken-wav.zip
rm intents-spoken-wav.zip
Binary file added vox-tiny/intent_model.pkl
Binary file not shown.
Loading