csheaff · csheaff · Feb 21, 2026 · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,7 +30,7 @@ jobs:
         run: sudo apt-get update && sudo apt-get install -y bats
 
       - name: Install test dependencies
-        run: sudo apt-get install -y ydotool pipewire libnotify-bin socat
+        run: sudo apt-get install -y wtype ydotool pipewire libnotify-bin socat
 
       - name: Run tests
         run: bats test/talktype.bats test/server.bats
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,71 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## What is talktype
+
+Push-to-talk speech-to-text for Linux. Press a hotkey to record, press again to transcribe and type at cursor. No GUI — just a keyboard shortcut bound to the `talktype` script. Works on Wayland (GNOME, Sway, Hyprland) and X11.
+
+## Build and install
+
+```bash
+make install      # Full setup: system deps + Python venv + symlink to ~/.local/bin/talktype
+make deps         # System packages only (requires sudo): wtype, xdotool, ydotool, ffmpeg, pipewire, etc.
+make venv         # Python venv with faster-whisper only
+make parakeet     # Install Parakeet backend venv (in backends/.parakeet-venv/)
+make moonshine    # Install Moonshine backend venv (in backends/.moonshine-venv/)
+make model        # Pre-download Whisper model
+make clean        # Remove .venv
+make uninstall    # Remove ~/.local/bin/talktype symlink
+```
+
+## Testing
+
+Tests use [BATS](https://github.com/bats-core/bats-core) (Bash Automated Testing System):
+
+```bash
+make test                    # Run all tests
+bats test/talktype.bats      # Core tests (recording lifecycle, transcription, error handling)
+bats test/server.bats        # Server mode tests (daemon lifecycle, socket communication)
+bats test/backends.bats      # Integration tests against real backends + NASA audio fixture
+```
+
+Tests use mocks in `test/mocks/` to avoid requiring actual GPU, models, or system tools. The mock daemon (`test/mock-daemon.py`) simulates server backends.
+
+## Linting
+
+CI runs ShellCheck on all Bash scripts and Python syntax checks on all Python files:
+
+```bash
+shellcheck talktype transcribe-server backends/*-server
+python3 -m py_compile transcribe whisper-daemon.py backends/*-daemon.py
+```
+
+## Architecture
+
+**Core flow:** hotkey → `talktype` (Bash) → record audio (ffmpeg/pw-record) → call `$TALKTYPE_CMD` with WAV path → type result via `type_text` (wtype/ydotool/xdotool).
+
+**Main script** (`talktype`, ~160 lines Bash): manages recording state via PID file (`$TALKTYPE_DIR/rec.pid`), sends desktop notifications, delegates transcription to `$TALKTYPE_CMD`.
+
+**Backend pattern — two modes per backend:**
+- **Direct invocation** (`transcribe`, `backends/parakeet`, `backends/moonshine`): Python scripts that load model, transcribe, exit. Simple but slow (model reload each time).
+- **Server mode** (`transcribe-server`, `backends/*-server` + `*-daemon.py`): Bash wrapper manages a Python Unix socket daemon that keeps the model in memory. Subcommands: `start`, `stop`, `transcribe`. Auto-starts daemon if not running.
+
+**Adding a custom backend:** Any executable that takes a WAV file path as its last argument and prints text to stdout. Set `TALKTYPE_CMD` in config.
+
+## Configuration
+
+Config file: `~/.config/talktype/config` (sourced as shell script by `talktype`). Key variables:
+
+- `TALKTYPE_CMD` — transcription command (default: direct faster-whisper via `transcribe`)
+- `TALKTYPE_VENV` — Python venv path (default: `.venv` in script dir)
+- `TALKTYPE_DIR` — runtime dir for PID/audio files (default: `$XDG_RUNTIME_DIR/talktype`)
+- `TALKTYPE_TYPE_CMD` — typing tool (`auto`, `wtype`, `ydotool`, `xdotool`, or custom command; default: `auto`)
+- `WHISPER_MODEL`, `WHISPER_LANG`, `WHISPER_DEVICE`, `WHISPER_COMPUTE` — Whisper settings
+
+## Key conventions
+
+- Core is intentionally pure Bash. Python is only used for ML model invocation.
+- Follows Unix philosophy: small scripts, stdin/stdout interfaces, pluggable components.
+- Server daemons communicate via Unix sockets using `socat`.
+- State files (PID, audio, notification ID) live in `$TALKTYPE_DIR` (XDG runtime dir).
diff --git a/Makefile b/Makefile
@@ -12,7 +12,7 @@ install: deps venv
 
 # Install system dependencies (requires sudo)
 deps:
-	sudo apt install -y ydotool ffmpeg pipewire libnotify-bin python3-venv socat
+	sudo apt install -y wtype xdotool ydotool ffmpeg pipewire libnotify-bin python3-venv socat
 
 # Create Python venv with faster-whisper (default backend)
 venv: .venv/.done

diff --git a/README.md b/README.md
@@ -6,12 +6,11 @@ app to keep running — just a keyboard shortcut.
 
 - **Pluggable backends** — swap transcription models without changing anything else
 - **Works everywhere** — GNOME, Sway, Hyprland, i3, X11
-- **~100 lines of bash** — easy to read, easy to hack on
+- **~160 lines of bash** — easy to read, easy to hack on
 
 Ships with [faster-whisper](https://github.com/SYSTRAN/faster-whisper) by
-default, plus optional [Parakeet](https://huggingface.co/nvidia/parakeet-ctc-1.1b)
-and [Moonshine](https://huggingface.co/UsefulSensors/moonshine-base) backends.
-Or bring your own — anything that reads a WAV and prints text works.
+default, plus an optional [Moonshine](https://huggingface.co/UsefulSensors/moonshine-base)
+backend for CPU. Or bring your own — anything that reads a WAV and prints text works.
 
 > **Note:** This project is in early development — expect rough edges. If you
 > run into issues, please [open a bug](https://github.com/csheaff/talktype/issues).
@@ -20,8 +19,11 @@ Or bring your own — anything that reads a WAV and prints text works.
 
 - Linux (Wayland or X11)
 - Audio recorder: [ffmpeg](https://ffmpeg.org/) (preferred) or PipeWire (`pw-record`)
-- [ydotool](https://github.com/ReimuNotMoe/ydotool) for typing text
-  (user must be in the `input` group — see Install)
+- Typing tool (auto-detected, best available is used):
+  - [wtype](https://github.com/atx/wtype) — Wayland (Sway, Hyprland; not GNOME)
+  - [ydotool](https://github.com/ReimuNotMoe/ydotool) + `ydotoold` — Wayland & X11 (preferred with daemon)
+  - [xdotool](https://github.com/jordansissel/xdotool) — X11 only (not Wayland)
+  - ydotool without daemon — last resort, with warning
 - [socat](https://linux.die.net/man/1/socat) (for server-backed transcription)
 
 For the default backend (faster-whisper):
@@ -36,12 +38,14 @@ make install
 ```
 
 This will:
-1. Install system packages (`ydotool`, etc.)
+1. Install system packages (`wtype`, `ydotool`, etc.)
 2. Create a Python venv with `faster-whisper`
 3. Symlink `talktype` into `~/.local/bin/`
 
 ### ydotool permissions
 
+> **Note:** Only needed if you use ydotool. If you use wtype (Wayland) or xdotool (X11), skip this.
+
 `ydotool` needs access to `/dev/uinput`. Add yourself to the `input` group:
 
 ```bash
@@ -74,6 +78,10 @@ EOF
 Any `TALKTYPE_*` variable can go in this file. Environment variables still work
 and are applied after the config file, so they override it.
 
+Set `TALKTYPE_TYPE_CMD` to control which typing tool is used (`auto`, `wtype`,
+`ydotool`, `xdotool`, or any custom command). Default is `auto`, which tries, in order:
+wtype (Wayland) → ydotool with `ydotoold` → xdotool (X11) → ydotool without the daemon (last resort, with a warning).
+
 ## Setup
 
 Bind `talktype` to a keyboard shortcut:
@@ -96,8 +104,8 @@ bindsym $mod+d exec talktype
 
 ## Backends
 
-Three backends are included. Server backends auto-start on first use — the
-model loads once and stays in memory for fast subsequent transcriptions.
+Server backends auto-start on first use — the model loads once and stays in
+memory for fast subsequent transcriptions.
 
 ### Whisper (default)
 
@@ -118,21 +126,6 @@ TALKTYPE_CMD="/path/to/talktype/transcribe-server transcribe"
 | `WHISPER_DEVICE` | `cuda` | `cuda` or `cpu` |
 | `WHISPER_COMPUTE` | `float16` | `float16` (GPU), `int8` or `float32` (CPU) |
 
-### Parakeet (GPU, best word accuracy)
-
-[NVIDIA Parakeet CTC 1.1B](https://huggingface.co/nvidia/parakeet-ctc-1.1b)
-via HuggingFace Transformers. 1.1B params, excellent word accuracy.
-Note: CTC model — outputs lowercase text without punctuation.
-
-```bash
-make parakeet
-```
-
-```bash
-# ~/.config/talktype/config
-TALKTYPE_CMD="/path/to/talktype/backends/parakeet-server transcribe"
-```
-
 ### Moonshine (CPU, lightweight)
 
 [Moonshine](https://huggingface.co/UsefulSensors/moonshine-base) by Useful
@@ -150,14 +143,14 @@ TALKTYPE_CMD="/path/to/talktype/backends/moonshine-server transcribe"
 Set `MOONSHINE_MODEL=UsefulSensors/moonshine-tiny` for an even smaller 27M
 param model.
 
-### Manual server management
+### Server management
 
 The server starts automatically on first transcription. You can also manage
 it directly:
 
 ```bash
-./backends/parakeet-server start   # start manually
-./backends/parakeet-server stop    # stop the server
+./transcribe-server start   # start manually
+./transcribe-server stop    # stop the server
 ```
 
 ### Custom backends
@@ -182,10 +175,10 @@ contract — use whatever model, language, or runtime you want.
                                             ↓
                                      $TALKTYPE_CMD audio.wav
                                             ↓
-                                     ydotool type → text appears at cursor
+                                     type_text → text appears at cursor
 ```
 
-The `talktype` script is ~80 lines of bash. Transcription backends are
+The `talktype` script is ~160 lines of bash. Transcription backends are
 swappable. Server mode uses Unix sockets to keep models in memory.
 
 ## License

diff --git a/backends/moonshine-server b/backends/moonshine-server
@@ -8,6 +8,12 @@
 #   TALKTYPE_CMD="backends/moonshine-server transcribe" talktype
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+# Source user config so env vars are available even when invoked directly.
+TALKTYPE_CONFIG="${TALKTYPE_CONFIG:-${XDG_CONFIG_HOME:-$HOME/.config}/talktype/config}"
+# shellcheck disable=SC1090
+[ -f "$TALKTYPE_CONFIG" ] && source "$TALKTYPE_CONFIG"
+
 VENV="$SCRIPT_DIR/.moonshine-venv"
 SOCK="${XDG_RUNTIME_DIR:-/tmp}/moonshine.sock"
 PIDFILE="${XDG_RUNTIME_DIR:-/tmp}/moonshine-server.pid"
@@ -29,7 +35,7 @@ case "${1:-}" in
         PID=$!
         disown "$PID"
         echo "$PID" > "$PIDFILE"
-        for i in $(seq 1 30); do
+        for _ in $(seq 1 60); do
             [ -S "$SOCK" ] && break
             sleep 1
         done
@@ -50,10 +56,13 @@ case "${1:-}" in
         fi
         ;;
     transcribe)
+        if [ -S "$SOCK" ] && [ -f "$PIDFILE" ] && ! kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
+            rm -f "$PIDFILE" "$SOCK"
+        fi
         if [ ! -S "$SOCK" ]; then
             "$0" start >&2 || exit 1
         fi
-        echo "$2" | socat - UNIX-CONNECT:"$SOCK"
+        echo "$2" | socat -T 30 - UNIX-CONNECT:"$SOCK"
         ;;
     *)
         echo "Usage: moonshine-server {start|stop|transcribe <audio.wav>}" >&2

diff --git a/backends/parakeet-server b/backends/parakeet-server
@@ -8,6 +8,12 @@
 #   TALKTYPE_CMD="backends/parakeet-server transcribe" talktype
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+# Source user config so env vars are available even when invoked directly.
+TALKTYPE_CONFIG="${TALKTYPE_CONFIG:-${XDG_CONFIG_HOME:-$HOME/.config}/talktype/config}"
+# shellcheck disable=SC1090
+[ -f "$TALKTYPE_CONFIG" ] && source "$TALKTYPE_CONFIG"
+
 VENV="$SCRIPT_DIR/.parakeet-venv"
 SOCK="${XDG_RUNTIME_DIR:-/tmp}/parakeet.sock"
 PIDFILE="${XDG_RUNTIME_DIR:-/tmp}/parakeet-server.pid"
@@ -28,7 +34,7 @@ case "${1:-}" in
         PID=$!
         disown "$PID"
         echo "$PID" > "$PIDFILE"
-        for i in $(seq 1 60); do
+        for _ in $(seq 1 60); do
             [ -S "$SOCK" ] && break
             sleep 1
         done
@@ -49,10 +55,13 @@ case "${1:-}" in
         fi
         ;;
     transcribe)
+        if [ -S "$SOCK" ] && [ -f "$PIDFILE" ] && ! kill -0 "$(cat "$PIDFILE")" 2>/dev/null; then
+            rm -f "$PIDFILE" "$SOCK"
+        fi
         if [ ! -S "$SOCK" ]; then
             "$0" start >&2 || exit 1
         fi
-        echo "$2" | socat - UNIX-CONNECT:"$SOCK"
+        echo "$2" | socat -T 30 - UNIX-CONNECT:"$SOCK"
         ;;
     *)
         echo "Usage: parakeet-server {start|stop|transcribe <audio.wav>}" >&2

diff --git a/talktype b/talktype
@@ -9,7 +9,7 @@
 # Transcription is pluggable: set TALKTYPE_CMD to any command that
 # takes a WAV file path as its last argument and prints text to stdout.
 #
-# Requires: ydotool, pw-record (PipeWire)
+# Requires: one of wtype/ydotool/xdotool, ffmpeg or pw-record, notify-send
 #
 set -euo pipefail
 
@@ -58,10 +58,61 @@ notify_close() {
     fi
 }
 
+# ── Typing tool selection ──
+warn_ydotool_no_daemon() {  # one-time advisory warning (stderr + desktop notification); never fails the caller
+    local warnfile="$TALKTYPE_DIR/.ydotool-warned"
+    [ -f "$warnfile" ] && return   # marker present: already warned
+    touch "$warnfile" 2>/dev/null || true   # best-effort marker: under set -e a failed touch must not abort typing
+    echo "Warning: ydotool without ydotoold leaks virtual input devices (see issue #7). Install wtype (Wayland) or run ydotoold." >&2
+    notify-send -t 5000 -i dialog-warning "TalkType" "ydotool without daemon — may leak input devices" 2>/dev/null || true
+}
+
+# Type $1 at the cursor using $TALKTYPE_TYPE_CMD: wtype, ydotool, xdotool, a custom command, or auto (the default).
+# auto tries wtype, ydotool with ydotoold, xdotool, then daemon-less ydotool; it returns 1 if none of them succeeds.
+type_text() {
+    local text="$1" cmd="${TALKTYPE_TYPE_CMD:-auto}" daemon=0
+    pgrep -x ydotoold &>/dev/null && daemon=1   # probe once; both ydotool paths reuse the result
+
+    # Explicit tool — use it directly
+    if [ "$cmd" != "auto" ]; then
+        case "$cmd" in
+            wtype)    wtype -- "$text" ;;
+            ydotool)  [ "$daemon" -eq 1 ] || warn_ydotool_no_daemon
+                      ydotool type --key-delay 20 -- "$text" ;;
+            xdotool)  xdotool type -- "$text" ;;
+            *)        $cmd "$text" ;;   # unquoted: $cmd is word-split, the text becomes the last argument
+        esac
+        return
+    fi
+
+    # Auto-detect: try each safe tool, fall through on failure (wtype may be installed but unsupported by the compositor)
+    if [ -n "${WAYLAND_DISPLAY:-}" ] && command -v wtype &>/dev/null; then
+        wtype -- "$text" 2>/dev/null && return
+    fi
+    if [ "$daemon" -eq 1 ] && command -v ydotool &>/dev/null; then
+        ydotool type --key-delay 20 -- "$text" && return
+    fi
+    if [ -n "${DISPLAY:-}" ] && [ -z "${WAYLAND_DISPLAY:-}" ] && command -v xdotool &>/dev/null; then
+        xdotool type -- "$text" && return
+    fi
+    # Last resort: daemon-less ydotool. Skipped when ydotoold is running, because that attempt already failed above.
+    if [ "$daemon" -eq 0 ] && command -v ydotool &>/dev/null; then
+        warn_ydotool_no_daemon
+        ydotool type --key-delay 20 -- "$text" && return
+    fi
+    echo "Error: no typing tool found (install wtype, ydotool, or xdotool)" >&2
+    return 1
+}
+
 # ── Check core dependencies ──
 check_deps() {
     local missing=()
-    command -v ydotool    &>/dev/null || missing+=(ydotool)
+    local type_cmd="${TALKTYPE_TYPE_CMD:-auto}"
+    if [ "$type_cmd" = "auto" ]; then
+        command -v wtype &>/dev/null || command -v ydotool &>/dev/null || command -v xdotool &>/dev/null || missing+=("wtype, ydotool, or xdotool")
+    else
+        command -v "$type_cmd" &>/dev/null || missing+=("$type_cmd")
+    fi
     command -v ffmpeg &>/dev/null || command -v pw-record &>/dev/null || missing+=("ffmpeg or pipewire")
     command -v notify-send &>/dev/null || missing+=(libnotify-bin)
 
@@ -96,8 +147,8 @@ if [ -f "$PIDFILE" ]; then
 
     notify_close
 
-    # Type text at cursor via ydotool
-    ydotool type --key-delay 20 -- "$TEXT"
+    # Type text at cursor
+    type_text "$TEXT"
 
 # ── Otherwise → start recording ──
 else

diff --git a/test/mocks/pgrep b/test/mocks/pgrep
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+# Mock pgrep: by default, report no matching process (exit 1).
+# Set MOCK_PGREP_EXIT=0 in tests that need ydotoold to be detected as running.
+exit "${MOCK_PGREP_EXIT:-1}"
diff --git a/test/mocks/wtype b/test/mocks/wtype
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Mock wtype: append the arguments it was called with to $TALKTYPE_DIR/wtype.log
+echo "$@" >> "$TALKTYPE_DIR/wtype.log"
diff --git a/test/mocks/xdotool b/test/mocks/xdotool
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Mock xdotool: append the arguments it was called with to $TALKTYPE_DIR/xdotool.log
+echo "$@" >> "$TALKTYPE_DIR/xdotool.log"