Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
38628c6
Add voice channel adapter with WebSocket server
Mar 23, 2026
123694a
Add per-channel system prompt injection via function parameters
Mar 27, 2026
4dd628b
Fix cargo fmt formatting
Mar 28, 2026
8bc3077
Add voice pipeline: PCM mode, Smart Turn ONNX, STT/TTS providers, web…
Apr 1, 2026
3a96562
Fix Smart Turn: catch ort panic when libonnxruntime.so is missing
Apr 2, 2026
859f588
Fix Smart Turn startup hang: spawn_blocking + timeout fallback
Apr 2, 2026
7ec26a4
Route voice WebSocket through main API port; fix UI button icons
Apr 2, 2026
def1f12
Fix voice UI: move call button to header, add Permissions-Policy for mic
Apr 3, 2026
45339b0
Add voice identity handshake and optional speaker diarization
Apr 3, 2026
7216283
Add voice barge-in: interrupt agent speech on loud mic input
Apr 3, 2026
c0d0acf
Fix CSP: allow blob: in script-src for AudioWorklet
Apr 3, 2026
cd2f807
Fix CSP violations: blob: for AudioWorklet, nonce inline script, fix …
Apr 3, 2026
9c12be2
Fix silence fallback: use sleep_until(absolute) instead of sleep(dura…
Apr 3, 2026
c09f683
Fix build: restore rmcp-based mcp.rs, fix non-exhaustive struct ctor
Apr 4, 2026
d3174f3
Fix PCM mode: Hello frame before binary was routing sessions to text …
Apr 3, 2026
106fae9
Fix voice: spurious dispatch, multiple sessions, missing transcript
Apr 3, 2026
c12c3bf
Fix voice transcript: show clean text in chat, use Alpine-safe mutations
Apr 3, 2026
a274b88
Fix voice: discard stale responses when user speaks during thinking
Apr 3, 2026
7b52150
Voice: inject discarded response as context when user interrupts thin…
Apr 3, 2026
c435d0e
cargo fmt
Apr 4, 2026
02c1456
Fix voice config: use api_key_env instead of literal api_key
Apr 4, 2026
79fe580
Voice UI: gate phone button on server STT+TTS capability
Apr 4, 2026
23641cc
Remove personal name from doc comment example
Apr 4, 2026
5eaabdd
Fix Clippy warnings in voice and tts
Apr 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
540 changes: 350 additions & 190 deletions Cargo.lock

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# syntax=docker/dockerfile:1
FROM rust:1-slim-bookworm AS builder
WORKDIR /build
RUN apt-get update && apt-get install -y pkg-config libssl-dev && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y pkg-config libssl-dev perl make && rm -rf /var/lib/apt/lists/*
COPY Cargo.toml Cargo.lock ./
COPY crates ./crates
COPY xtask ./xtask
Expand All @@ -16,6 +16,14 @@ ENV CARGO_PROFILE_RELEASE_LTO=${LTO} \
RUN cargo build --release --bin openfang

FROM rust:1-slim-bookworm
# Install ONNX Runtime shared library for Smart Turn inference
ARG ORT_VERSION=1.20.1
ADD https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz /tmp/ort.tgz
RUN tar -xzf /tmp/ort.tgz -C /tmp && \
cp /tmp/onnxruntime-linux-x64-${ORT_VERSION}/lib/libonnxruntime.so.${ORT_VERSION} /usr/lib/ && \
ln -s /usr/lib/libonnxruntime.so.${ORT_VERSION} /usr/lib/libonnxruntime.so && \
ldconfig && \
rm -rf /tmp/ort.tgz /tmp/onnxruntime-linux-x64-${ORT_VERSION}
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
python3 \
Expand Down
33 changes: 33 additions & 0 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Fast dev image: binary built on host, just packaged here.
# Usage:
# cargo build --release --bin openfang # ~10s incremental
# docker build -t openfang-gw:latest -f Dockerfile.dev .
# docker compose up -d openfang
#
# First host build: ~3 min (same as Docker). Subsequent: ~10s.
# Requires the host target triple to match the container (linux/amd64).

FROM rust:1-slim-bookworm

# ONNX Runtime for Smart Turn
ARG ORT_VERSION=1.20.1
ADD https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/onnxruntime-linux-x64-${ORT_VERSION}.tgz /tmp/ort.tgz
RUN tar -xzf /tmp/ort.tgz -C /tmp && \
cp /tmp/onnxruntime-linux-x64-${ORT_VERSION}/lib/libonnxruntime.so.${ORT_VERSION} /usr/lib/ && \
ln -s /usr/lib/libonnxruntime.so.${ORT_VERSION} /usr/lib/libonnxruntime.so && \
ldconfig && \
rm -rf /tmp/ort.tgz /tmp/onnxruntime-linux-x64-${ORT_VERSION}

RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates python3 python3-pip python3-venv nodejs npm \
&& rm -rf /var/lib/apt/lists/*

# Copy pre-built binary from host
COPY target/release/openfang /usr/local/bin/
COPY agents /opt/openfang/agents

EXPOSE 4200
VOLUME /data
ENV OPENFANG_HOME=/data
ENTRYPOINT ["openfang"]
CMD ["start"]
6 changes: 6 additions & 0 deletions build-dev.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Fast dev build: compile on host (incremental), package into image.
set -e
cd "$(dirname "$0")"
cargo build --release --bin openfang
docker build -t openfang-gw:latest -f Dockerfile.dev .
106 changes: 96 additions & 10 deletions crates/openfang-api/src/channel_bridge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use openfang_channels::teams::TeamsAdapter;
use openfang_channels::telegram::TelegramAdapter;
use openfang_channels::twitch::TwitchAdapter;
use openfang_channels::types::ChannelAdapter;
use openfang_channels::voice::VoiceAdapter;
use openfang_channels::whatsapp::WhatsAppAdapter;
use openfang_channels::xmpp::XmppAdapter;
use openfang_channels::zulip::ZulipAdapter;
Expand All @@ -43,6 +44,7 @@ use openfang_channels::twist::TwistAdapter;
use openfang_channels::webex::WebexAdapter;
// Wave 5
use async_trait::async_trait;
use axum::Router;
use openfang_channels::dingtalk::DingTalkAdapter;
use openfang_channels::dingtalk_stream::DingTalkStreamAdapter;
use openfang_channels::discourse::DiscourseAdapter;
Expand Down Expand Up @@ -110,6 +112,23 @@ impl ChannelBridgeHandle for KernelBridgeAdapter {
Ok(result.response)
}

async fn send_message_with_channel_prompt(
&self,
agent_id: AgentId,
message: &str,
channel_prompt: &str,
) -> Result<String, String> {
let result = self
.kernel
.send_message_with_channel_prompt(agent_id, message, channel_prompt)
.await
.map_err(|e| format!("{e}"))?;
if result.silent {
return Ok(String::new());
}
Ok(result.response)
}

async fn find_agent_by_name(&self, name: &str) -> Result<Option<AgentId>, String> {
Ok(self.kernel.registry.find_by_name(name).map(|e| e.id))
}
Expand Down Expand Up @@ -774,6 +793,7 @@ impl ChannelBridgeHandle for KernelBridgeAdapter {
"mattermost" => channels.mattermost.as_ref().map(|c| c.overrides.clone()),
"irc" => channels.irc.as_ref().map(|c| c.overrides.clone()),
"google_chat" => channels.google_chat.as_ref().map(|c| c.overrides.clone()),
"voice" => channels.voice.as_ref().map(|c| c.overrides.clone()),
"twitch" => channels.twitch.as_ref().map(|c| c.overrides.clone()),
"rocketchat" => channels.rocketchat.as_ref().map(|c| c.overrides.clone()),
"zulip" => channels.zulip.as_ref().map(|c| c.overrides.clone()),
Expand Down Expand Up @@ -1060,10 +1080,12 @@ fn read_token(env_var_or_token: &str, adapter_name: &str) -> Option<String> {
///
/// Returns `Some(BridgeManager)` if any channels were configured and started,
/// or `None` if no channels are configured.
pub async fn start_channel_bridge(kernel: Arc<OpenFangKernel>) -> Option<BridgeManager> {
pub async fn start_channel_bridge(
kernel: Arc<OpenFangKernel>,
) -> (Option<BridgeManager>, Option<Router<()>>) {
let channels = kernel.config.channels.clone();
let (bridge, _names) = start_channel_bridge_with_config(kernel, &channels).await;
bridge
let (bridge, _names, voice_router) = start_channel_bridge_with_config(kernel, &channels).await;
(bridge, voice_router)
}

/// Start channels from an explicit `ChannelsConfig` (used by hot-reload).
Expand All @@ -1072,7 +1094,7 @@ pub async fn start_channel_bridge(kernel: Arc<OpenFangKernel>) -> Option<BridgeM
pub async fn start_channel_bridge_with_config(
kernel: Arc<OpenFangKernel>,
config: &openfang_types::config::ChannelsConfig,
) -> (Option<BridgeManager>, Vec<String>) {
) -> (Option<BridgeManager>, Vec<String>, Option<Router<()>>) {
let has_any = config.telegram.is_some()
|| config.discord.is_some()
|| config.slack.is_some()
Expand Down Expand Up @@ -1116,12 +1138,15 @@ pub async fn start_channel_bridge_with_config(
|| config.ntfy.is_some()
|| config.gotify.is_some()
|| config.webhook.is_some()
|| config.linkedin.is_some();
|| config.linkedin.is_some()
|| config.voice.is_some();

if !has_any {
return (None, Vec::new());
return (None, Vec::new(), None);
}

let mut voice_router: Option<Router<()>> = None;

let handle = KernelBridgeAdapter {
kernel: kernel.clone(),
started_at: Instant::now(),
Expand Down Expand Up @@ -1304,6 +1329,64 @@ pub async fn start_channel_bridge_with_config(
}
}

// Voice
if let Some(ref voice_config) = config.voice {
let mut adapter = VoiceAdapter::new(
voice_config.listen.clone(),
voice_config.default_agent.clone(),
);
// Attach PCM pipeline if STT + TTS are configured.
// SmartTurnDetector::load() is synchronous and CPU-bound (ONNX model load);
// run it in spawn_blocking so it doesn't stall the async runtime.
if let (Some(stt), Some(tts)) = (voice_config.stt.clone(), voice_config.tts.clone()) {
let smart_turn_cfg = voice_config.smart_turn.clone();
let smart_turn = if let Some(cfg) = smart_turn_cfg {
let load_result = tokio::time::timeout(
std::time::Duration::from_secs(15),
tokio::task::spawn_blocking(move || {
openfang_channels::smart_turn::SmartTurnDetector::load(
&cfg.model_path,
cfg.threshold,
)
.map_err(|e| e.to_string())
}),
)
.await;
match load_result {
Ok(Ok(Ok(detector))) => Some(detector),
Ok(Ok(Err(e))) => {
warn!("Smart Turn model failed to load, will use silence detection: {e}");
None
}
Ok(Err(e)) => {
warn!("Smart Turn load task panicked: {e}");
None
}
Err(_) => {
warn!(
"Smart Turn model load timed out after 15s, will use silence detection"
);
None
}
}
} else {
None
};
adapter = adapter.with_pipeline(
stt,
tts,
smart_turn,
voice_config.barge_in_threshold,
voice_config.barge_in_speaking_threshold,
);
}
// Extract the /voice WebSocket router before wrapping in Arc<dyn>.
// This is merged into the main API server so voice is reachable through
// the same port as the REST API — no separate port exposure needed.
voice_router = Some(adapter.make_router());
adapters.push((Arc::new(adapter), voice_config.default_agent.clone()));
}

// Twitch
if let Some(ref tw_config) = config.twitch {
if let Some(token) = read_token(&tw_config.oauth_token_env, "Twitch") {
Expand Down Expand Up @@ -1700,7 +1783,7 @@ pub async fn start_channel_bridge_with_config(
}

if adapters.is_empty() {
return (None, Vec::new());
return (None, Vec::new(), None);
}

// Resolve per-channel default agents AND set the first one as system-wide fallback
Expand Down Expand Up @@ -1780,9 +1863,9 @@ pub async fn start_channel_bridge_with_config(
}

if started_names.is_empty() {
(None, Vec::new())
(None, Vec::new(), None)
} else {
(Some(manager), started_names)
(Some(manager), started_names, voice_router)
}
}

Expand Down Expand Up @@ -1839,7 +1922,9 @@ pub async fn reload_channels_from_disk(
*state.channels_config.write().await = fresh_config.channels.clone();

// Start new bridge with fresh channel config
let (new_bridge, started) =
// Note: voice_router is ignored on hot-reload — the main Axum router is
// immutable after startup. Voice remains accessible via the existing route.
let (new_bridge, started, _voice_router) =
start_channel_bridge_with_config(state.kernel.clone(), &fresh_config.channels).await;

// Store the new bridge
Expand Down Expand Up @@ -1870,6 +1955,7 @@ mod tests {
assert!(config.channels.mattermost.is_none());
assert!(config.channels.irc.is_none());
assert!(config.channels.google_chat.is_none());
assert!(config.channels.voice.is_none());
assert!(config.channels.twitch.is_none());
assert!(config.channels.rocketchat.is_none());
assert!(config.channels.zulip.is_none());
Expand Down
40 changes: 40 additions & 0 deletions crates/openfang-api/src/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2208,6 +2208,38 @@ const CHANNEL_REGISTRY: &[ChannelMeta] = &[
setup_steps: &["Create a WeCom application at work.weixin.qq.com", "Get Corp ID, Agent ID, and Secret", "Configure callback URL to your webhook endpoint"],
config_template: "[channels.wecom]\ncorp_id = \"\"\nagent_id = \"\"\nsecret_env = \"WECOM_SECRET\"",
},
// ── Voice / Audio ───────────────────────────────────────────────────────
ChannelMeta {
name: "voice", display_name: "Voice", icon: "🎙",
description: "WebSocket voice adapter — text mode (client handles STT/TTS) or PCM mode (server-side pipeline with Deepgram/Cartesia/OpenAI)",
category: "developer", difficulty: "Easy", setup_time: "~5 min",
quick_setup: "Speak to your agent via the web UI or any WebSocket client",
setup_type: "form",
fields: &[
ChannelField { key: "listen", label: "Listen Address", field_type: FieldType::Text, env_var: None, required: false, placeholder: "0.0.0.0:4201", advanced: false },
ChannelField { key: "default_agent", label: "Default Agent", field_type: FieldType::Text, env_var: None, required: false, placeholder: "assistant", advanced: false },
// STT
ChannelField { key: "stt.provider", label: "STT Provider", field_type: FieldType::Text, env_var: None, required: false, placeholder: "deepgram", advanced: false },
ChannelField { key: "stt.api_key", label: "STT API Key", field_type: FieldType::Secret, env_var: Some("DEEPGRAM_API_KEY"), required: false, placeholder: "Token ...", advanced: false },
ChannelField { key: "stt.language", label: "STT Language", field_type: FieldType::Text, env_var: None, required: false, placeholder: "en", advanced: true },
ChannelField { key: "stt.model", label: "STT Model", field_type: FieldType::Text, env_var: None, required: false, placeholder: "nova-3", advanced: true },
// TTS
ChannelField { key: "tts.provider", label: "TTS Provider", field_type: FieldType::Text, env_var: None, required: false, placeholder: "cartesia", advanced: false },
ChannelField { key: "tts.api_key", label: "TTS API Key", field_type: FieldType::Secret, env_var: Some("CARTESIA_API_KEY"), required: false, placeholder: "sk-...", advanced: false },
ChannelField { key: "tts.voice_id", label: "Voice ID", field_type: FieldType::Text, env_var: None, required: false, placeholder: "...", advanced: false },
ChannelField { key: "tts.model", label: "TTS Model", field_type: FieldType::Text, env_var: None, required: false, placeholder: "sonic-2", advanced: true },
ChannelField { key: "tts.speed", label: "Speed", field_type: FieldType::Number, env_var: None, required: false, placeholder: "1.0", advanced: true },
// Smart Turn
ChannelField { key: "smart_turn.model_path", label: "Smart Turn Model Path", field_type: FieldType::Text, env_var: None, required: false, placeholder: "/data/models/smart-turn-v3.2.onnx", advanced: true },
ChannelField { key: "smart_turn.threshold", label: "Smart Turn Threshold", field_type: FieldType::Number, env_var: None, required: false, placeholder: "0.5", advanced: true },
],
setup_steps: &[
"Set Listen Address (default 0.0.0.0:4201)",
"For PCM mode: configure STT and TTS providers",
"Use the web UI voice mode or connect via WebSocket",
],
config_template: "[channels.voice]\nlisten = \"0.0.0.0:4201\"\ndefault_agent = \"assistant\"\n\n[channels.voice.stt]\nprovider = \"deepgram\"\napi_key = \"\"\n\n[channels.voice.tts]\nprovider = \"cartesia\"\napi_key = \"\"\nvoice_id = \"\"\n\n[channels.voice.smart_turn]\nmodel_path = \"/data/models/smart-turn-v3.2.onnx\"\nthreshold = 0.5",
},
];

/// Check if a channel is configured (has a `[channels.xxx]` section in config).
Expand Down Expand Up @@ -5213,6 +5245,13 @@ pub async fn list_tools(State(state): State<Arc<AppState>>) -> impl IntoResponse
pub async fn get_config(State(state): State<Arc<AppState>>) -> impl IntoResponse {
// Return a redacted view of the kernel config
let config = &state.kernel.config;
let voice_pcm_enabled = config
.channels
.voice
.as_ref()
.map(|v| v.stt.is_some() && v.tts.is_some())
.unwrap_or(false);

Json(serde_json::json!({
"home_dir": config.home_dir.to_string_lossy(),
"data_dir": config.data_dir.to_string_lossy(),
Expand All @@ -5225,6 +5264,7 @@ pub async fn get_config(State(state): State<Arc<AppState>>) -> impl IntoResponse
"memory": {
"decay_rate": config.memory.decay_rate,
},
"voice_pcm_enabled": voice_pcm_enabled,
}))
}

Expand Down
15 changes: 13 additions & 2 deletions crates/openfang-api/src/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@ pub async fn build_router(
kernel: Arc<OpenFangKernel>,
listen_addr: SocketAddr,
) -> (Router<()>, Arc<AppState>) {
// Start channel bridges (Telegram, etc.)
let bridge = channel_bridge::start_channel_bridge(kernel.clone()).await;
// Start channel bridges (Telegram, voice, etc.)
// voice_router: the /voice WebSocket handler merged into the main router so
// a single reverse-proxy rule covers both REST and voice WebSocket.
let (bridge, voice_router) = channel_bridge::start_channel_bridge(kernel.clone()).await;

let channels_config = kernel.config.channels.clone();
let state = Arc::new(AppState {
Expand Down Expand Up @@ -736,6 +738,15 @@ pub async fn build_router(
.layer(cors)
.with_state(state.clone());

// Merge the /voice WebSocket handler (already has its own state embedded)
// AFTER applying middleware so voice bypasses auth/rate-limiting, matching
// the behaviour of the standalone voice server on port 4201.
let app = if let Some(vr) = voice_router {
app.merge(vr)
} else {
app
};

(app, state)
}

Expand Down
6 changes: 5 additions & 1 deletion crates/openfang-api/src/webchat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ pub async fn webchat_page() -> impl IntoResponse {
let html = WEBCHAT_HTML.replace(NONCE_PLACEHOLDER, &nonce);
let csp = format!(
"default-src 'self'; \
script-src 'self' 'nonce-{nonce}' 'unsafe-eval'; \
script-src 'self' 'nonce-{nonce}' 'unsafe-eval' blob:; \
style-src 'self' 'unsafe-inline' https://fonts.googleapis.com https://fonts.gstatic.com; \
img-src 'self' data: blob:; \
connect-src 'self' ws://localhost:* ws://127.0.0.1:* wss://localhost:* wss://127.0.0.1:*; \
Expand All @@ -109,6 +109,10 @@ pub async fn webchat_page() -> impl IntoResponse {
csp,
),
(header::CACHE_CONTROL, "no-store".to_string()),
(
header::HeaderName::from_static("permissions-policy"),
"microphone=*, camera=()".to_string(),
),
],
html,
)
Expand Down
Loading
Loading