diff --git a/.github/workflows/clips-desktop-build-check.yml b/.github/workflows/clips-desktop-build-check.yml index 02043567ff..c640a99748 100644 --- a/.github/workflows/clips-desktop-build-check.yml +++ b/.github/workflows/clips-desktop-build-check.yml @@ -58,6 +58,20 @@ jobs: - name: Install dependencies run: pnpm install --frozen-lockfile + # whisper-rs compiles whisper.cpp (CMake + C/C++) and runs bindgen, which + # needs libclang. windows-latest ships LLVM and CMake preinstalled; point + # bindgen at libclang and fall back to a Chocolatey install if missing. + - name: Set up libclang for whisper-rs bindgen (Windows) + if: runner.os == 'Windows' + shell: bash + run: | + if [ -f "/c/Program Files/LLVM/bin/libclang.dll" ]; then + echo "LIBCLANG_PATH=C:\\Program Files\\LLVM\\bin" >> "$GITHUB_ENV" + else + choco install llvm --no-progress -y + echo "LIBCLANG_PATH=C:\\Program Files\\LLVM\\bin" >> "$GITHUB_ENV" + fi + - name: Build frontend working-directory: ${{ env.TAURI_APP_PATH }} run: pnpm run build diff --git a/.github/workflows/clips-desktop-release.yml b/.github/workflows/clips-desktop-release.yml index 9db6d057b3..59cb507ce1 100644 --- a/.github/workflows/clips-desktop-release.yml +++ b/.github/workflows/clips-desktop-release.yml @@ -123,6 +123,21 @@ jobs: V=$(node -p "require('./${{ env.TAURI_APP_PATH }}/package.json').version") echo "version=$V" >> "$GITHUB_OUTPUT" + # whisper-rs compiles whisper.cpp (CMake + C/C++) and runs bindgen, which + # needs libclang. windows-latest ships LLVM and CMake preinstalled; point + # bindgen at libclang and fall back to a Chocolatey install if the image + # ever drops it. macOS already has clang via Xcode. 
+ - name: Set up libclang for whisper-rs bindgen (Windows) + if: runner.os == 'Windows' + shell: bash + run: | + if [ -f "/c/Program Files/LLVM/bin/libclang.dll" ]; then + echo "LIBCLANG_PATH=C:\\Program Files\\LLVM\\bin" >> "$GITHUB_ENV" + else + choco install llvm --no-progress -y + echo "LIBCLANG_PATH=C:\\Program Files\\LLVM\\bin" >> "$GITHUB_ENV" + fi + - name: Build and release (tauri-action) uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5 # v0 env: diff --git a/templates/clips/desktop/README.md b/templates/clips/desktop/README.md index 5665c1cb37..b8a73d3c03 100644 --- a/templates/clips/desktop/README.md +++ b/templates/clips/desktop/README.md @@ -6,6 +6,37 @@ A small Tauri 2.x menu-bar app that lives in the macOS menu bar / Windows system - **Recent** — your three most recent recordings - Quick links to **Open library** and **Settings** +## Meeting transcription (macOS + Windows) + +When the calendar watcher detects a meeting, the popover surfaces a "Start +notes" notification. Accepting it captures **both** the microphone and the +system (speaker) audio, transcribes them locally with Whisper (whisper.cpp, no +cloud round-trip), and streams a live mic/system-labeled transcript. + +Audio capture is platform-dispatched in `src-tauri/src/capture/`: + +- **macOS** — AVAudioEngine with VoiceProcessingIO acoustic echo cancellation + (mic) and ScreenCaptureKit (system audio). +- **Windows** — [`cpal`](https://crates.io/crates/cpal): the default WASAPI + input device (mic) and WASAPI **loopback** of the default output device + (system audio). Loopback needs no OS permission prompt. + +The Whisper engine (`src-tauri/src/whisper_speech.rs`) and the meeting +detection, notification, and transcript-rendering flows are otherwise identical +across platforms. + +### Known Windows limitations (v1) + +- **No mic echo cancellation.** macOS applies hardware AEC so the mic stream + doesn't echo the system audio. 
`cpal` delivers the raw mic, so expect some + speaker bleed into the mic transcript when not on headphones. Mic and system + are transcribed and labeled separately, so this is cosmetic. +- **No sleep / call-ended auto-stop.** The macOS sleep and call-ended watchers + are no-ops on Windows; the silence-based auto-stop still works. Stop notes + manually or via the silence timeout. +- macOS-only features stay macOS-only: screen/window video recording, EventKit + local calendar, and Accessibility-based personal-vocabulary auto-learn. + ## Develop First install the desktop workspace's own deps (this folder is outside the monorepo's `templates/*` glob because it ships its own Tauri/Vite toolchain): diff --git a/templates/clips/desktop/src-tauri/Cargo.toml b/templates/clips/desktop/src-tauri/Cargo.toml index 2b04b12e54..f9cf897784 100644 --- a/templates/clips/desktop/src-tauri/Cargo.toml +++ b/templates/clips/desktop/src-tauri/Cargo.toml @@ -112,13 +112,27 @@ screencapturekit = { version = "2.0.0", features = ["macos_15_0"] } # AudioBufferList type so we can hand SCK's CMSampleBuffer audio bytes to # AVAudioPCMBuffer for the speech recognizer. objc2-core-audio-types = "0.3" -whisper-rs = { version = "0.16.0" } # NOTE: We hand-roll the few EventKit selectors we need via raw `objc2` # msg_send! in `eventkit.rs` rather than depending on the `objc2-event-kit` # crate. Its API surface drifts between minor 0.3.x releases (and not # every version exposes `EKEventStore::eventsMatchingPredicate`); the # slice we need is small enough that hand-rolling is more durable. +# Windows-only: cross-platform audio capture. Covers both the microphone +# (default WASAPI input) and system audio (WASAPI loopback — an input stream on +# the default output device). One crate for both meeting capture streams. +# TODO: verify this version pin against crates.io before release; network was +# unavailable in the dev environment where this was added. 
+[target."cfg(target_os = \"windows\")".dependencies] +cpal = "0.15" + +# Local meeting transcription via whisper.cpp. Cross-platform (bundles its own +# native C/C++ build via CMake), needed only on the two platforms that run the +# Whisper meeting engine. Scoped here rather than the macOS-only block (it is +# not macOS-specific) but kept off Linux, which has no meeting capture. +[target."cfg(any(target_os = \"macos\", target_os = \"windows\"))".dependencies] +whisper-rs = { version = "0.16.0" } + [profile.release] panic = "abort" codegen-units = 1 diff --git a/templates/clips/desktop/src-tauri/src/capture/macos.rs b/templates/clips/desktop/src-tauri/src/capture/macos.rs new file mode 100644 index 0000000000..95e907e000 --- /dev/null +++ b/templates/clips/desktop/src-tauri/src/capture/macos.rs @@ -0,0 +1,10 @@ +//! macOS capture backend — thin re-exports of the existing native impls. +//! +//! The high-quality macOS path is left entirely untouched: the microphone runs +//! through `native_speech` (AVAudioEngine + VoiceProcessingIO AEC) and the +//! system audio through `system_audio` (ScreenCaptureKit). This file only +//! surfaces them under the platform-agnostic names the `capture` contract and +//! `whisper_speech.rs` expect. + +pub(crate) use crate::native_speech::macos::{start_raw_mic_capture, RawMicCapture}; +pub(crate) use crate::system_audio::macos::{start_raw_system_capture, RawSystemCapture}; diff --git a/templates/clips/desktop/src-tauri/src/capture/mod.rs b/templates/clips/desktop/src-tauri/src/capture/mod.rs new file mode 100644 index 0000000000..d3382303d0 --- /dev/null +++ b/templates/clips/desktop/src-tauri/src/capture/mod.rs @@ -0,0 +1,39 @@ +//! Platform-dispatched audio capture for meeting transcription. +//! +//! The local Whisper engine (`whisper_speech.rs`) needs exactly two capture +//! primitives — a microphone stream and a system-audio (loopback) stream — +//! each forwarding mono `f32` samples to a callback and exposing the hardware +//! 
sample rate plus a `stop()`. Everything else in the meeting pipeline +//! (detection, notifications, transcript rendering) is already cross-platform. +//! +//! This module owns that public contract and dispatches to the right backend +//! at compile time: +//! +//! - macOS → thin re-exports of the proven `native_speech` (AVAudioEngine + +//! VPIO AEC) mic path and `system_audio` (ScreenCaptureKit) loopback. +//! - Windows → `cpal` mic + WASAPI loopback (see `windows.rs`). +//! +//! Both backends expose the same names so `whisper_speech.rs` stays +//! platform-agnostic: +//! +//! ```ignore +//! start_raw_mic_capture(app, mic_device_id, mic_device_label, on_samples) -> RawMicCapture +//! start_raw_system_capture(app, on_samples) -> RawSystemCapture +//! ``` +//! +//! Each handle type exposes `sample_rate() -> f64` and `stop()` so the session +//! teardown in `whisper_speech.rs` works unchanged across platforms. + +#[cfg(target_os = "macos")] +mod macos; +#[cfg(target_os = "macos")] +pub(crate) use macos::{ + start_raw_mic_capture, start_raw_system_capture, RawMicCapture, RawSystemCapture, +}; + +#[cfg(target_os = "windows")] +mod windows; +#[cfg(target_os = "windows")] +pub(crate) use windows::{ + start_raw_mic_capture, start_raw_system_capture, RawMicCapture, RawSystemCapture, +}; diff --git a/templates/clips/desktop/src-tauri/src/capture/windows.rs b/templates/clips/desktop/src-tauri/src/capture/windows.rs new file mode 100644 index 0000000000..552154df3b --- /dev/null +++ b/templates/clips/desktop/src-tauri/src/capture/windows.rs @@ -0,0 +1,333 @@ +//! Windows capture backend — `cpal` microphone + WASAPI loopback system audio. +//! +//! `cpal` is the single cross-platform crate that covers both primitives the +//! Whisper engine needs on Windows: +//! +//! - **Microphone:** the default (or label-matched) WASAPI input device. +//! - **System audio:** WASAPI *loopback* — `cpal` builds an input stream on +//! 
/// Payload of the `voice:audio-level` event emitted from the capture callbacks.
///
/// Mirrors the macOS `voice:audio-level` payload so the waveform meter and the
/// silence detector consume an identical event shape on every platform.
#[derive(Serialize, Clone)]
struct AudioLevelPayload {
    /// Peak absolute amplitude of the mono buffer that triggered the event.
    level: f32,
    /// Which capture stream produced the buffer: `"mic"` or `"system"`.
    source: &'static str,
}
+// We never call methods on it from another thread — the only cross-thread +// operation is moving the handle into the session `Mutex` and later dropping +// it, which signals the capture thread to stop. This mirrors the macOS handles' +// `unsafe impl Send`. +unsafe impl Send for RawMicCapture {} + +impl RawMicCapture { + /// Hardware sample rate of the mic stream (e.g. 48000) — the Whisper engine + /// resamples to 16 kHz from this. + pub(crate) fn sample_rate(&self) -> f64 { + self.sample_rate + } + + pub(crate) fn stop(self) { + drop(self.stream); + } +} + +/// Handle for a running WASAPI loopback (system audio) capture. +pub(crate) struct RawSystemCapture { + stream: Stream, + #[allow(dead_code)] + sample_rate: f64, +} + +// SAFETY: same argument as `RawMicCapture`. +unsafe impl Send for RawSystemCapture {} + +impl RawSystemCapture { + pub(crate) fn stop(self) { + drop(self.stream); + } +} + +// ---- sample conversion ------------------------------------------------------ + +/// Down-mix an interleaved buffer of any sample format / channel count to mono +/// `f32`. cpal may deliver `i16`/`u16`/`f32`/… and 1–N channels; we average all +/// channels per frame so neither stereo music nor multi-channel mixes are +/// truncated to one side. +fn interleaved_to_mono_f32(data: &[T], channels: usize) -> Vec +where + T: Copy, + f32: FromSample, +{ + if channels <= 1 { + return data.iter().map(|&s| f32::from_sample(s)).collect(); + } + let frames = data.len() / channels; + let mut out = Vec::with_capacity(frames); + for frame in 0..frames { + let base = frame * channels; + let mut acc = 0.0f32; + for c in 0..channels { + acc += f32::from_sample(data[base + c]); + } + out.push(acc / channels as f32); + } + out +} + +/// Peak absolute amplitude of a mono buffer — the level the waveform meter and +/// silence detector expect. 
/// Largest absolute sample value in `mono`, or `0.0` when the buffer is empty.
///
/// This is the value reported as `level` in `voice:audio-level` events.
fn peak_level(mono: &[f32]) -> f32 {
    mono.iter().map(|sample| sample.abs()).fold(0.0, f32::max)
}
+fn build_for_format( + device: &cpal::Device, + config: &StreamConfig, + sample_format: SampleFormat, + mono_cb: impl Fn(&[f32]) + Send + 'static, +) -> Result { + match sample_format { + SampleFormat::F32 => build_stream::(device, config, mono_cb), + SampleFormat::F64 => build_stream::(device, config, mono_cb), + SampleFormat::I8 => build_stream::(device, config, mono_cb), + SampleFormat::I16 => build_stream::(device, config, mono_cb), + SampleFormat::I32 => build_stream::(device, config, mono_cb), + SampleFormat::I64 => build_stream::(device, config, mono_cb), + SampleFormat::U8 => build_stream::(device, config, mono_cb), + SampleFormat::U16 => build_stream::(device, config, mono_cb), + SampleFormat::U32 => build_stream::(device, config, mono_cb), + SampleFormat::U64 => build_stream::(device, config, mono_cb), + other => Err(format!("unsupported sample format: {other:?}")), + } +} + +// ---- device selection ------------------------------------------------------- + +/// Resolve the microphone device. The renderer passes a web +/// `enumerateDevices()` id/label, which does not map to cpal's device names, so +/// we best-effort match the label against cpal's device names (case-insensitive +/// substring) and fall back to the system default input. +fn resolve_input_device( + host: &cpal::Host, + mic_device_label: Option<&str>, +) -> Result { + if let Some(label) = mic_device_label.map(str::trim).filter(|l| !l.is_empty()) { + let needle = label.to_ascii_lowercase(); + if let Ok(devices) = host.input_devices() { + for device in devices { + if let Ok(name) = device.name() { + if name.to_ascii_lowercase().contains(&needle) { + return Ok(device); + } + } + } + } + } + host.default_input_device() + .ok_or_else(|| "no default microphone device available".to_string()) +} + +// ---- public entry points ---------------------------------------------------- + +/// Start microphone capture and forward every mono `f32` buffer to +/// `on_samples`. 
`mic_device_id` is currently unused on Windows (the web device +/// id has no cpal equivalent); selection is by `mic_device_label`. +pub(crate) fn start_raw_mic_capture( + app: AppHandle, + mic_device_id: Option, + mic_device_label: Option, + on_samples: Arc, +) -> Result { + let _ = mic_device_id; + let host = cpal::default_host(); + let device = resolve_input_device(&host, mic_device_label.as_deref())?; + let supported = device + .default_input_config() + .map_err(|e| format!("default_input_config failed: {e}"))?; + let sample_format = supported.sample_format(); + let sample_rate = supported.sample_rate().0 as f64; + let config: StreamConfig = supported.into(); + + let cb = make_callback(app.clone(), "mic", 2, on_samples); + let stream = build_for_format(&device, &config, sample_format, cb)?; + use cpal::traits::StreamTrait; + stream + .play() + .map_err(|e| format!("mic stream play failed: {e}"))?; + + eprintln!( + "[capture-win] mic capture started: {} Hz, {} ch, {:?}", + sample_rate as u32, config.channels, sample_format + ); + Ok(RawMicCapture { + stream, + sample_rate, + }) +} + +/// Start system-audio capture via WASAPI loopback. cpal builds an *input* +/// stream on the default *output* device, which captures the speaker mix. No OS +/// permission prompt is required for loopback. +pub(crate) fn start_raw_system_capture( + app: AppHandle, + on_samples: Arc, +) -> Result { + let host = cpal::default_host(); + let device = host + .default_output_device() + .ok_or_else(|| "no default output device available for loopback".to_string())?; + // The loopback stream uses the output device's native render format. 
/// Unit tests for the pure sample-conversion helpers. The stream-building code
/// needs real audio devices and is not covered here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn mono_passthrough() {
        let data: [f32; 3] = [0.1, -0.2, 0.3];
        let out = interleaved_to_mono_f32(&data, 1);
        assert_eq!(out, vec![0.1, -0.2, 0.3]);
    }

    #[test]
    fn zero_channels_treated_as_mono() {
        // `channels <= 1` takes the passthrough path, so a bogus channel count
        // of 0 must not divide by zero or drop samples.
        let data: [f32; 2] = [0.25, -0.5];
        let out = interleaved_to_mono_f32(&data, 0);
        assert_eq!(out, vec![0.25, -0.5]);
    }

    #[test]
    fn empty_input_yields_empty_output() {
        let data: [f32; 0] = [];
        assert!(interleaved_to_mono_f32(&data, 1).is_empty());
        assert!(interleaved_to_mono_f32(&data, 2).is_empty());
    }

    #[test]
    fn stereo_f32_averaged_to_mono() {
        // L/R interleaved: (1.0,-1.0) -> 0.0 ; (0.5,0.5) -> 0.5
        let data: [f32; 4] = [1.0, -1.0, 0.5, 0.5];
        let out = interleaved_to_mono_f32(&data, 2);
        assert_eq!(out, vec![0.0, 0.5]);
    }

    #[test]
    fn trailing_partial_frame_is_dropped() {
        // Five samples at two channels: two whole frames plus one stray sample
        // that does not form a frame and is ignored.
        let data: [f32; 5] = [1.0, 0.0, 0.5, 0.5, 0.75];
        let out = interleaved_to_mono_f32(&data, 2);
        assert_eq!(out, vec![0.5, 0.5]);
    }

    #[test]
    fn stereo_i16_converted_and_averaged() {
        // i16::MAX in both channels -> ~1.0 mono; opposite extremes -> ~0.0
        let data: [i16; 4] = [i16::MAX, i16::MAX, i16::MAX, i16::MIN];
        let out = interleaved_to_mono_f32(&data, 2);
        assert!((out[0] - 1.0).abs() < 1e-3, "got {}", out[0]);
        assert!(out[1].abs() < 1e-3, "got {}", out[1]);
    }

    #[test]
    fn four_channel_averaged() {
        let data: [f32; 4] = [1.0, 1.0, 1.0, 1.0];
        let out = interleaved_to_mono_f32(&data, 4);
        assert_eq!(out, vec![1.0]);
    }

    #[test]
    fn peak_level_picks_max_abs() {
        assert_eq!(peak_level(&[0.1, -0.9, 0.4]), 0.9);
        assert_eq!(peak_level(&[]), 0.0);
    }
}
a/templates/clips/desktop/src-tauri/src/lib.rs +++ b/templates/clips/desktop/src-tauri/src/lib.rs @@ -5,6 +5,7 @@ //! is served by the Vite-built React UI (see `../dist`). mod accessibility; +mod capture; mod clips; mod config; mod debug; @@ -202,8 +203,9 @@ pub fn run() { // Pre-download the Whisper model in the background so the first // meeting doesn't pay the ~142 MB download cost mid-call. Skipped - // when the user has disabled the model in Settings. - #[cfg(target_os = "macos")] + // when the user has disabled the model in Settings. Runs on both + // macOS and Windows (the two platforms with meeting transcription). + #[cfg(any(target_os = "macos", target_os = "windows"))] { let cfg = config::feature_config(app.handle()); if cfg.whisper_model_enabled diff --git a/templates/clips/desktop/src-tauri/src/system_audio.rs b/templates/clips/desktop/src-tauri/src/system_audio.rs index 6295402ff4..45713c70ce 100644 --- a/templates/clips/desktop/src-tauri/src/system_audio.rs +++ b/templates/clips/desktop/src-tauri/src/system_audio.rs @@ -58,12 +58,22 @@ pub fn system_audio_version_status() -> VersionStatus { { macos::version_status() } - #[cfg(not(target_os = "macos"))] + #[cfg(target_os = "windows")] + { + // WASAPI loopback is available on all supported Windows versions and + // needs no special OS capability check. 
+ VersionStatus { + supported: true, + os_version: std::env::consts::OS.to_string(), + reason: None, + } + } + #[cfg(not(any(target_os = "macos", target_os = "windows")))] { VersionStatus { supported: false, os_version: std::env::consts::OS.to_string(), - reason: Some("System audio capture is only supported on macOS.".into()), + reason: Some("System audio capture is only supported on macOS and Windows.".into()), } } } @@ -85,9 +95,16 @@ pub async fn system_audio_request_permission() -> Result { } macos::request_screen_capture_access().await } - #[cfg(not(target_os = "macos"))] + #[cfg(target_os = "windows")] + { + // WASAPI loopback requires no permission grant; the mic permission is + // handled by the OS at first capture. Report success so the renderer + // proceeds straight to capture. + Ok(true) + } + #[cfg(not(any(target_os = "macos", target_os = "windows")))] { - Err("System audio capture is only supported on macOS.".into()) + Err("System audio capture is only supported on macOS and Windows.".into()) } } diff --git a/templates/clips/desktop/src-tauri/src/whisper_speech.rs b/templates/clips/desktop/src-tauri/src/whisper_speech.rs index bba6644c7e..5c3d1869bf 100644 --- a/templates/clips/desktop/src-tauri/src/whisper_speech.rs +++ b/templates/clips/desktop/src-tauri/src/whisper_speech.rs @@ -7,10 +7,10 @@ //! by `source`. whisper.cpp has no such limit: we run one whisper context with //! a per-stream worker thread, fully offline. //! -//! Capture is reused from the existing modules: -//! - mic → `native_speech::macos::start_raw_mic_capture` (AVAudioEngine + -//! VoiceProcessingIO AEC, other-audio ducking off) -//! - system → `system_audio::macos::start_raw_system_capture` (ScreenCaptureKit) +//! Capture is delegated to the platform-dispatched `crate::capture` module: +//! - macOS → AVAudioEngine + VoiceProcessingIO AEC (mic) and +//! ScreenCaptureKit (system audio). +//! - Windows → `cpal` microphone + WASAPI loopback (system audio). //! 
/// Tauri command: stop Whisper meeting transcription.
///
/// On macOS and Windows this delegates to `engine::stop`. On every other
/// platform there is no engine and the call is a no-op. Always returns
/// `Ok(())`.
#[tauri::command]
pub async fn meeting_whisper_stop(app: AppHandle) -> Result<(), String> {
    #[cfg(any(target_os = "macos", target_os = "windows"))]
    {
        engine::stop(&app);
        Ok(())
    }
    #[cfg(not(any(target_os = "macos", target_os = "windows")))]
    {
        // Nothing to stop here; bind `app` so it is not reported as unused.
        let _ = app;
        Ok(())
    }
}
refcounted ObjC objects (already - // `Send`); the streams are `Arc` over `Send + Sync` interiors. We only move - // the session through the `Mutex`, never alias across threads. + // SAFETY: the capture handles wrap platform audio resources that are each + // `unsafe impl Send` (refcounted ObjC objects on macOS; cpal stream handles + // on Windows); the streams are `Arc` over `Send + Sync` interiors. We only + // move the session through the `Mutex`, never alias across threads. unsafe impl Send for Session {} fn session_slot() -> &'static Mutex> {