diff --git a/app/ai/voice/agents/breeze_buddy/agent/__init__.py b/app/ai/voice/agents/breeze_buddy/agent/__init__.py index 4f9f0caf7..5325a5e03 100644 --- a/app/ai/voice/agents/breeze_buddy/agent/__init__.py +++ b/app/ai/voice/agents/breeze_buddy/agent/__init__.py @@ -914,6 +914,7 @@ async def run(self, runner_args: Optional[RunnerArguments] = None) -> None: None if is_stream else self._handle_user_idle_timeout ), mode="stream" if is_stream else "agent", + is_daily_mode=self.is_daily_mode, ) self._context_aggregator = context_aggregator diff --git a/app/ai/voice/agents/breeze_buddy/agent/pipeline.py b/app/ai/voice/agents/breeze_buddy/agent/pipeline.py index 243e5de35..45c167921 100644 --- a/app/ai/voice/agents/breeze_buddy/agent/pipeline.py +++ b/app/ai/voice/agents/breeze_buddy/agent/pipeline.py @@ -43,6 +43,7 @@ from app.ai.voice.agents.breeze_buddy.llm import get_llm_service from app.ai.voice.agents.breeze_buddy.observability.tracing_setup import setup_tracing from app.ai.voice.agents.breeze_buddy.processors import ( + AudioPreBufferProcessor, TranscriptCollectorProcessor, TranscriptionGateProcessor, UserIdleCallbackHandler, @@ -199,6 +200,7 @@ async def build_pipeline( configurations: Optional[ConfigurationModel] = None, on_user_idle_timeout: Optional[Callable[[int], Any]] = None, mode: Literal["agent", "stream"] = "agent", + is_daily_mode: bool = False, ) -> tuple[ Pipeline, LLMContext, @@ -475,6 +477,14 @@ async def build_pipeline( # UserTurnStrategies — no custom response gate needed. # Note: RTVIProcessor is added automatically by PipelineTask (pipecat v0.0.102+) # when enable_rtvi=True (default). No need to add it to the pipeline manually. + + # Audio pre-buffer for Daily mode: sits between TTS and transport output. + # Buffers the first few audio frames per bot turn to give the Daily SDK's + # WebRTC play cursor a head start, preventing initial-buffer starvation gaps. 
+ audio_pre_buffer: Optional[AudioPreBufferProcessor] = None + if is_daily_mode and not is_stream: + audio_pre_buffer = AudioPreBufferProcessor(pre_buffer_count=3) + pipeline_parts: list[Any] = [transport.input(), stt, transcription_gate] if is_stream: assert transcript_collector is not None @@ -483,9 +493,12 @@ async def build_pipeline( if is_stream: pipeline_parts.extend([tts, transport.output()]) else: - pipeline_parts.extend( - [llm, tts, transport.output(), context_aggregator.assistant()] - ) + pipeline_parts.append(llm) + pipeline_parts.append(tts) + if audio_pre_buffer: + pipeline_parts.append(audio_pre_buffer) + pipeline_parts.append(transport.output()) + pipeline_parts.append(context_aggregator.assistant()) return ( Pipeline(pipeline_parts), diff --git a/app/ai/voice/agents/breeze_buddy/agent/transport.py b/app/ai/voice/agents/breeze_buddy/agent/transport.py index a138c2cb3..841b749e0 100644 --- a/app/ai/voice/agents/breeze_buddy/agent/transport.py +++ b/app/ai/voice/agents/breeze_buddy/agent/transport.py @@ -101,6 +101,7 @@ def get_transport_params( audio_out_enabled=True, audio_in_filter=audio_in_filter, audio_out_mixer=daily_mixer, + audio_out_10ms_chunks=2, # 20ms chunks for smoother Daily WebRTC cadence ), "twilio": lambda: FastAPIWebsocketParams( audio_in_enabled=True, diff --git a/app/ai/voice/agents/breeze_buddy/processors/__init__.py b/app/ai/voice/agents/breeze_buddy/processors/__init__.py index 1776e18f2..f8b74b785 100644 --- a/app/ai/voice/agents/breeze_buddy/processors/__init__.py +++ b/app/ai/voice/agents/breeze_buddy/processors/__init__.py @@ -1,5 +1,8 @@ """Breeze Buddy custom processors for pipeline control.""" +from app.ai.voice.agents.breeze_buddy.processors.audio_pre_buffer import ( + AudioPreBufferProcessor, +) from app.ai.voice.agents.breeze_buddy.processors.transcript_collector import ( TranscriptCollectorProcessor, ) @@ -11,6 +14,7 @@ ) __all__ = [ + "AudioPreBufferProcessor", "TranscriptCollectorProcessor", 
"TranscriptionGateProcessor", "UserIdleCallbackHandler",
diff --git a/app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py b/app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py
new file mode 100644
index 000000000..804fe3d41
--- /dev/null
+++ b/app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py
@@ -0,0 +1,89 @@
+"""Audio pre-buffer processor for Daily WebRTC output.
+
+Buffers the first N audio frames from TTS before releasing them downstream
+to the transport output. This gives the Daily SDK's internal WebRTC buffer
+a head start, preventing play-cursor starvation during the initial burst of
+TTS audio.
+
+Once the pre-buffer is full, all buffered frames are flushed at once, and
+subsequent frames pass through immediately with no additional latency.
+
+This processor is only useful for Daily (WebRTC) mode where the
+without_mixer audio path has no built-in pacing or jitter buffer.
+"""
+
+from pipecat.frames.frames import (
+    BotStoppedSpeakingFrame,
+    EndFrame,
+    Frame,
+    InterruptionFrame,
+    OutputAudioRawFrame,
+    TTSStoppedFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
+
+
+class AudioPreBufferProcessor(FrameProcessor):
+    """Buffers the first N audio frames per bot turn, then passes through.
+
+    Pipeline position: between TTS and transport.output()
+        ... → tts → AudioPreBufferProcessor → transport.output() → ...
+
+    Held audio is always released downstream, toward transport.output(),
+    regardless of which direction the frame that triggered the release was
+    travelling. The buffer re-arms when the bot stops speaking or is
+    interrupted, so every response gets the same initial runway.
+
+    Args:
+        pre_buffer_count: Number of audio frames to collect before the
+            first release of a turn. A value of 1 or less releases the
+            first frame immediately.
+        **kwargs: Forwarded unchanged to FrameProcessor.
+    """
+
+    def __init__(self, pre_buffer_count: int = 3, **kwargs):
+        super().__init__(**kwargs)
+        self._pre_buffer_count = pre_buffer_count
+        self._buffer: list[OutputAudioRawFrame] = []
+        self._buffering = True
+
+    async def _flush(self) -> None:
+        """Release every held audio frame downstream, in arrival order."""
+        # Detach the list before awaiting so no frame can be pushed twice.
+        pending, self._buffer = self._buffer, []
+        for held in pending:
+            await self.push_frame(held, FrameDirection.DOWNSTREAM)
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Buffer the leading audio of a turn; forward every other frame."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, OutputAudioRawFrame):
+            if not self._buffering:
+                # Runway already delivered for this turn: no added latency.
+                await self.push_frame(frame, direction)
+                return
+            self._buffer.append(frame)
+            if len(self._buffer) >= self._pre_buffer_count:
+                await self._flush()
+                self._buffering = False
+            return
+
+        if isinstance(frame, InterruptionFrame):
+            # An interruption abandons the current response; held audio must
+            # not leak into the start of the next one.
+            self._buffer.clear()
+            self._buffering = True
+        elif isinstance(frame, (TTSStoppedFrame, EndFrame)):
+            # If synthesis ended with fewer than N frames held, release them
+            # now: the transport output has not seen this audio yet, so
+            # waiting for BotStoppedSpeakingFrame would leave it stranded.
+            await self._flush()
+        elif isinstance(frame, BotStoppedSpeakingFrame):
+            # This frame can arrive travelling upstream from the transport
+            # output, so leftover audio is flushed downstream explicitly
+            # rather than in the frame's own direction.
+            await self._flush()
+            self._buffering = True
+
+        await self.push_frame(frame, direction)
diff --git a/docs/TTS_AUDIO_GAP_ROOT_CAUSE_ANALYSIS.md b/docs/TTS_AUDIO_GAP_ROOT_CAUSE_ANALYSIS.md new file mode 100644 index 000000000..32a050086 --- /dev/null +++ b/docs/TTS_AUDIO_GAP_ROOT_CAUSE_ANALYSIS.md @@ -0,0 +1,213 @@ +# TTS Audio Gap Root Cause Analysis — BB Daily Mode + +**Date**: 2026-05-05 +**Branch**: add-filler-words-and-function-call-bg-music +**Status**: Post-rebase analysis (pipecat-ai 1.1.0, daily-python 0.28.0) + +--- + +## Problem Statement + +TTS audio has audible gaps/breaks in Breeze Buddy's Daily (WebRTC) mode. 
The issue is: +- TTS-provider-agnostic (ElevenLabs + Cartesia both affected) +- Occurs even with NO background sound mixer and NO function call background music +- Affects Daily mode significantly more than telephony (Twilio/Plivo/Exotel) +- Affects BB more than the Automatic agent + +--- + +## Diagnostic Tests Run + +13 tests across 6 test classes in `tests/test_audio_gap_diagnosis.py`. All pass. + +| Test Class | What It Tests | Result | +|------------|---------------|--------| +| TestEventLoopContention | Do BB's extra tasks starve audio output? | **ELIMINATED** — max gap 1.1ms even with 80 competing tasks | +| TestSOXRResampler | Does resampler state clearing cause discontinuities? | **ELIMINATED** — discontinuity ratio 0.19 (threshold 2.0) | +| TestDailySDKCallbackLatency | Does the `call_soon` / `write_frames` callback add latency? | **ELIMINATED** — P99 0.35ms under 30 competing tasks | +| TestBaselinePipelineJitter | Baseline frame delivery jitter | **ELIMINATED** — sub-ms jitter in ideal conditions | +| TestInterContextSilence | 500ms silence between TTS audio contexts | **CONFIRMED** (now fixed by pipecat 1.1.0) | +| TestAggregateSentencesImpact | aggregate_sentences creates more context boundaries | **REVISED** — not a cause post-rebase (sentence aggregation is beneficial) | + +--- + +## Root Causes + +### FIXED by Rebase (pipecat-ai 0.0.102 → 1.1.0, daily-python 0.23.0 → 0.28.0) + +#### RC-1: 500ms Inter-Context Silence — FIXED +**Severity**: HIGH — was the #1 cause of audible gaps +**Status**: FIXED in pipecat-ai 1.1.0 + +The old `AudioContextTTSService._audio_context_task_handler` (tts_service.py:1171) appended 500ms of silence (`b"\x00" * self.sample_rate`) between every TTS audio context (sentence). BB's structured LLM responses produce many short contexts → many 500ms pauses. 
+ +In pipecat-ai 1.1.0: +- The `silence = b"\x00" * self.sample_rate` line is completely removed +- `AudioContextTTSService` is deprecated (now a thin wrapper around `WebsocketTTSService`) +- No inter-context silence is injected + +#### RC-2: Outdated daily-python SDK — FIXED +**Severity**: HIGH — missing gapless audio improvements +**Status**: FIXED — daily-python now at 0.28.0 + +Was 5 versions behind (0.23.0). Version 0.28.0 includes "gapless audio for raw-tracks" (Daily changelog #073, Dec 2025) with WebRTC audio pipeline improvements for raw audio tracks. + +--- + +### STILL PRESENT (Active Root Causes) + +#### RC-3: No Audio Pacing in `without_mixer` Path — ACTIVE +**Severity**: MEDIUM-HIGH — likely the primary remaining cause +**Status**: NOT FIXED in pipecat-ai 1.1.0 + +**File**: `base_output.py:761` (in pipecat-ai package) +**Code**: +```python +async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]: + while True: + try: + frame = await asyncio.wait_for( + self._audio_queue.get(), timeout=vad_stop_secs + ) + yield frame + self._audio_queue.task_done() + except TimeoutError: + await self._bot_stopped_speaking() +``` + +When no background sound mixer is configured (most BB Daily calls), the output path has **zero pacing**: +- Frames are pulled from queue and written to `CustomAudioSource.write_frames()` as fast as they arrive +- No jitter buffer, no pre-buffering, no rate control +- TTS produces audio in bursts → Daily SDK's WebRTC play cursor catches up during delivery delays → gaps +- The `with_mixer` path at least has `await asyncio.sleep(0)` between iterations; `without_mixer` has nothing +- Default chunk size is 40ms (`audio_out_10ms_chunks=4`) — if one write is delayed, the gap is proportionally larger + +**Why Automatic is less affected**: Automatic typically has `allow_interruptions=True` and a simpler pipeline with fewer processors. Fewer processors = frames flow through faster = less bursty delivery. 
+ +**Audio flow**: +``` +TTS audio frames + → BaseOutputTransport.MediaSender.handle_audio_frame() + → resample + buffer in bytearray + → chunk into 40ms pieces (4 x 10ms default) + → put into _audio_queue + → _next_frame() / without_mixer + → yield immediately (NO pacing) + → _audio_task_handler() + → DailyOutputTransport.write_audio_frame() + → CustomAudioSource.write_frames() [daily-python native] + → await completion callback +``` + +--- + +### NOT A ROOT CAUSE (Revised) + +#### `aggregate_sentences=True` — NOT a cause (beneficial to keep) +Keeping `aggregate_sentences=True` is actually better now that the 500ms inter-context silence is gone: +- Each sentence produces one continuous audio stream (fewer fragment boundaries) +- Sentence-level prosody optimization = better TTS quality +- Fewer TTS requests overall (one per sentence vs one per token) +- Setting `aggregate_sentences=False` would create MORE fragment boundaries with startup overhead per fragment, potentially worsening gaps + +--- + +### ELIMINATED (Not Root Causes) + +| Cause | Evidence | +|-------|----------| +| Silero VAD blocking event loop | `analyze_audio()` uses `run_in_executor` in both BB and Automatic — already offloaded to thread pool | +| Event loop contention from BB's extra tasks | Max gap 1.1ms even with 80 competing tasks (test confirmed) | +| SOXR resampler discontinuity | Discontinuity ratio 0.19, well below 2.0 threshold | +| Daily SDK callback latency | P99 0.35ms under 30 competing tasks | +| `aggregate_sentences=True` bursty delivery | Not a cause post-rebase — sentence-level aggregation produces smoother continuous audio | + +--- + +## Fixes — Ordered by Priority + +### Fix 1: Reduce Audio Chunk Size (Priority: HIGH, Effort: TRIVIAL) + +**What**: Set `audio_out_10ms_chunks=2` in DailyParams (20ms chunks instead of 40ms) +**Why**: Smaller, more frequent writes to Daily SDK → smoother cadence. 
If one write is delayed by 10ms, the gap is only 20ms total instead of 40ms — proportionally less audible. Same audio content, same playback speed, just smaller write granularity. +**File**: `app/ai/voice/agents/breeze_buddy/agent/transport.py` +**Change**: +```python +# Before +"daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + audio_in_filter=audio_in_filter, + audio_out_mixer=daily_mixer, +), + +# After +"daily": lambda: DailyParams( + audio_in_enabled=True, + audio_out_enabled=True, + audio_in_filter=audio_in_filter, + audio_out_mixer=daily_mixer, + audio_out_10ms_chunks=2, # 20ms chunks for smoother Daily WebRTC cadence +), +``` +**Risk**: More `write_frames` calls per second (50 vs 25), slightly more overhead. Negligible. +**Rollback**: Remove the parameter (defaults to 4) + +--- + +### Fix 2: Add Pre-Buffering Before Audio Output (Priority: MEDIUM, Effort: MEDIUM) + +**What**: Buffer 2-3 audio chunks (40-60ms) before starting to write to Daily SDK +**Why**: Gives the Daily SDK a head start so small delivery delays don't starve the WebRTC play cursor. The SDK has an internal buffer that absorbs jitter once it has some runway — the problem is the initial burst where the play cursor starts immediately with zero buffer. 
+**File**: New file `app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py` +**Approach**: Insert an `AudioPreBufferProcessor` between TTS and `transport.output()` (Daily, non-stream mode only) that holds the first few audio frames of each bot turn and then releases them together before streaming the rest +**Risk**: Adds 40-60ms initial TTS latency; acceptable for voice calls +**Rollback**: Remove `AudioPreBufferProcessor` from the pipeline + +--- + +### Fix 3: Pipeline-Level Pacing via `asyncio.sleep` (Priority: LOW, Effort: LOW-MEDIUM) + +**What**: Add a minimal `asyncio.sleep(0)` yield in the `without_mixer` path (same as `with_mixer` path) +**Why**: Gives other tasks a chance to run between audio frame deliveries, preventing burst delivery +**File**: Monkey-patch or subclass `BaseOutputTransport.MediaSender._next_frame` in pipecat +**Approach**: +```python +async def without_mixer(vad_stop_secs): + while True: + try: + frame = await asyncio.wait_for( + self._audio_queue.get(), timeout=vad_stop_secs + ) + yield frame + self._audio_queue.task_done() + await asyncio.sleep(0) # yield to event loop + except TimeoutError: + await self._bot_stopped_speaking() +``` +**Risk**: Minimal — `sleep(0)` just yields control to other tasks; doesn't delay audio +**Rollback**: Remove the patch + +--- + +## Verification Plan + +After each fix: +1. Run diagnostic tests: `.venv/bin/python -m pytest tests/test_audio_gap_diagnosis.py -v -s` +2. Test with a real Daily call (playground mode) — listen for gaps +3. Test with a telephony call — verify no regression +4. Test with each TTS provider (ElevenLabs, Cartesia) + +## Recommended Implementation Order + +1. **Fix 1** (chunk size) — trivial one-liner, safe, immediate +2. **Fix 2** (pre-buffering) — more involved, test after above +3. **Fix 3** (asyncio.sleep yield) — lowest priority, least certain impact + +## Version History + +| Component | Before Rebase | After Rebase | +|-----------|--------------|-------------| +| pipecat-ai | 0.0.102 | 1.1.0 | +| pipecat-ai-flows | 0.0.22 | 1.0.0 | +| daily-python | 0.23.0 | 0.28.0 |