Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/ai/voice/agents/breeze_buddy/agent/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,7 @@ async def run(self, runner_args: Optional[RunnerArguments] = None) -> None:
None if is_stream else self._handle_user_idle_timeout
),
mode="stream" if is_stream else "agent",
is_daily_mode=self.is_daily_mode,
)
self._context_aggregator = context_aggregator

Expand Down
19 changes: 16 additions & 3 deletions app/ai/voice/agents/breeze_buddy/agent/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from app.ai.voice.agents.breeze_buddy.llm import get_llm_service
from app.ai.voice.agents.breeze_buddy.observability.tracing_setup import setup_tracing
from app.ai.voice.agents.breeze_buddy.processors import (
AudioPreBufferProcessor,
TranscriptCollectorProcessor,
TranscriptionGateProcessor,
UserIdleCallbackHandler,
Expand Down Expand Up @@ -199,6 +200,7 @@ async def build_pipeline(
configurations: Optional[ConfigurationModel] = None,
on_user_idle_timeout: Optional[Callable[[int], Any]] = None,
mode: Literal["agent", "stream"] = "agent",
is_daily_mode: bool = False,
) -> tuple[
Pipeline,
LLMContext,
Expand Down Expand Up @@ -475,6 +477,14 @@ async def build_pipeline(
# UserTurnStrategies — no custom response gate needed.
# Note: RTVIProcessor is added automatically by PipelineTask (pipecat v0.0.102+)
# when enable_rtvi=True (default). No need to add it to the pipeline manually.

# Audio pre-buffer for Daily mode: sits between TTS and transport output.
# Buffers the first few audio frames per bot turn to give the Daily SDK's
# WebRTC play cursor a head start, preventing initial-buffer starvation gaps.
audio_pre_buffer: Optional[AudioPreBufferProcessor] = None
if is_daily_mode and not is_stream:
audio_pre_buffer = AudioPreBufferProcessor(pre_buffer_count=3)

pipeline_parts: list[Any] = [transport.input(), stt, transcription_gate]
if is_stream:
assert transcript_collector is not None
Expand All @@ -483,9 +493,12 @@ async def build_pipeline(
if is_stream:
pipeline_parts.extend([tts, transport.output()])
else:
pipeline_parts.extend(
[llm, tts, transport.output(), context_aggregator.assistant()]
)
pipeline_parts.append(llm)
pipeline_parts.append(tts)
if audio_pre_buffer:
pipeline_parts.append(audio_pre_buffer)
pipeline_parts.append(transport.output())
pipeline_parts.append(context_aggregator.assistant())

return (
Pipeline(pipeline_parts),
Expand Down
1 change: 1 addition & 0 deletions app/ai/voice/agents/breeze_buddy/agent/transport.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ def get_transport_params(
audio_out_enabled=True,
audio_in_filter=audio_in_filter,
audio_out_mixer=daily_mixer,
audio_out_10ms_chunks=2, # 20ms chunks for smoother Daily WebRTC cadence
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
Expand Down
4 changes: 4 additions & 0 deletions app/ai/voice/agents/breeze_buddy/processors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Breeze Buddy custom processors for pipeline control."""

from app.ai.voice.agents.breeze_buddy.processors.audio_pre_buffer import (
AudioPreBufferProcessor,
)
from app.ai.voice.agents.breeze_buddy.processors.transcript_collector import (
TranscriptCollectorProcessor,
)
Expand All @@ -11,6 +14,7 @@
)

__all__ = [
"AudioPreBufferProcessor",
"TranscriptCollectorProcessor",
"TranscriptionGateProcessor",
"UserIdleCallbackHandler",
Expand Down
70 changes: 70 additions & 0 deletions app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Audio pre-buffer processor for Daily WebRTC output.

Buffers the first N audio frames from TTS before releasing them downstream
to the transport output. This gives the Daily SDK's internal WebRTC buffer
a head start, preventing play-cursor starvation during the initial burst of
TTS audio.

Once the pre-buffer is full, all buffered frames are flushed at once, and
subsequent frames pass through immediately with no additional latency.

This processor is only useful for Daily (WebRTC) mode where the
without_mixer audio path has no built-in pacing or jitter buffer.
"""

from pipecat.frames.frames import (
    BotStoppedSpeakingFrame,
    Frame,
    OutputAudioRawFrame,
    TTSStoppedFrame,
)
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor



class AudioPreBufferProcessor(FrameProcessor):
Comment on lines +20 to +24
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

Fix the extra blank line to unblock CI (Black formatting failure).

There are three consecutive blank lines between the import block and the class definition (lines 21–23). Black requires exactly two, and the pipeline fails on this file.

🛠️ Proposed fix
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


-
 class AudioPreBufferProcessor(FrameProcessor):
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class AudioPreBufferProcessor(FrameProcessor):
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
class AudioPreBufferProcessor(FrameProcessor):
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py` around lines
20 - 24, Remove the extra blank line between the import block and the class
definition so Black formatting passes: ensure there are exactly two consecutive
newlines between the import statements (including "from
pipecat.processors.frame_processor import FrameDirection, FrameProcessor") and
the "class AudioPreBufferProcessor(FrameProcessor):" declaration by deleting one
of the three blank lines.

"""Buffers the first N audio frames per bot turn, then passes through.

Pipeline position: between TTS and transport.output()
... → tts → AudioPreBufferProcessor → transport.output() → ...

Buffering is re-armed whenever a BotStoppedSpeakingFrame is received, so
the next response gets the same initial runway.
"""

def __init__(self, pre_buffer_count: int = 3, **kwargs):
    """Create the processor with an empty buffer, armed to pre-buffer.

    Args:
        pre_buffer_count: Number of audio frames to collect before the
            held-back frames are released.
        **kwargs: Forwarded unchanged to the ``FrameProcessor`` constructor.
    """
    super().__init__(**kwargs)
    # Start armed so the very first audio frames seen are held back too.
    self._buffering = True
    # Audio frames collected while armed, in arrival order.
    self._buffer: list[OutputAudioRawFrame] = []
    # Threshold at which the collected frames are released.
    self._pre_buffer_count = pre_buffer_count

async def process_frame(self, frame: Frame, direction: FrameDirection):
    """Hold back the first N audio frames of a response, then pass audio through.

    While armed, ``OutputAudioRawFrame``s are collected instead of forwarded.
    Once ``pre_buffer_count`` frames are held they are released together and
    later audio frames are forwarded immediately. Every non-audio frame is
    forwarded unchanged in the direction it arrived.

    Two frames act as control signals:

    * ``TTSStoppedFrame`` - releases a partially filled buffer. Without this,
      a response shorter than ``pre_buffer_count`` frames would stay held
      until some later frame happened to flush it.
    * ``BotStoppedSpeakingFrame`` - releases anything still held and re-arms
      buffering for the next response.

    Args:
        frame: The frame to handle.
        direction: The direction ``frame`` is travelling through the pipeline.
    """
    await super().process_frame(frame, direction)

    if isinstance(frame, OutputAudioRawFrame):
        if not self._buffering:
            # Pre-buffer already released: no added latency from here on.
            await self.push_frame(frame, direction)
            return
        self._buffer.append(frame)
        if len(self._buffer) >= self._pre_buffer_count:
            await self._flush_buffer()
            self._buffering = False
        # The frame was either flushed above or is still being held.
        return

    if isinstance(frame, TTSStoppedFrame):
        if self._buffer:
            # Short response: release what we have rather than strand it.
            await self._flush_buffer()
            self._buffering = False
    elif isinstance(frame, BotStoppedSpeakingFrame):
        # Release leftovers and re-arm so the next response is pre-buffered.
        await self._flush_buffer()
        self._buffering = True

    await self.push_frame(frame, direction)

async def _flush_buffer(self) -> None:
    """Release all held audio frames towards the transport output, in order.

    Held audio is always pushed downstream, never in the direction of the
    frame that triggered the flush: a control frame such as
    ``BotStoppedSpeakingFrame`` may be travelling upstream, and audio must
    not be sent back towards the TTS service.
    """
    # Detach the list first so frames arriving while we await are not lost
    # or pushed twice.
    pending, self._buffer = self._buffer, []
    for buffered_frame in pending:
        await self.push_frame(buffered_frame, FrameDirection.DOWNSTREAM)
213 changes: 213 additions & 0 deletions docs/TTS_AUDIO_GAP_ROOT_CAUSE_ANALYSIS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
# TTS Audio Gap Root Cause Analysis — BB Daily Mode

**Date**: 2026-05-05
**Branch**: add-filler-words-and-function-call-bg-music
**Status**: Post-rebase analysis (pipecat-ai 1.1.0, daily-python 0.28.0)

---

## Problem Statement

TTS audio has audible gaps/breaks in Breeze Buddy's Daily (WebRTC) mode. The issue is:
- TTS-provider-agnostic (ElevenLabs + Cartesia both affected)
- Occurs even with NO background sound mixer and NO function call background music
- Affects Daily mode significantly more than telephony (Twilio/Plivo/Exotel)
- Affects BB more than the Automatic agent

---

## Diagnostic Tests Run

13 tests across 6 test classes in `tests/test_audio_gap_diagnosis.py`. All pass.

| Test Class | What It Tests | Result |
|------------|---------------|--------|
| TestEventLoopContention | Do BB's extra tasks starve audio output? | **ELIMINATED** — max gap 1.1ms even with 80 competing tasks |
| TestSOXRResampler | Does resampler state clearing cause discontinuities? | **ELIMINATED** — discontinuity ratio 0.19 (threshold 2.0) |
| TestDailySDKCallbackLatency | Is the `call_soon` / `write_frames` callback delayed? | **ELIMINATED** — P99 0.35ms under 30 competing tasks |
| TestBaselinePipelineJitter | Baseline frame delivery jitter | **ELIMINATED** — sub-ms jitter in ideal conditions |
| TestInterContextSilence | 500ms silence between TTS audio contexts | **CONFIRMED** (now fixed by pipecat 1.1.0) |
| TestAggregateSentencesImpact | aggregate_sentences creates more context boundaries | **REVISED** — not a cause post-rebase (sentence aggregation is beneficial) |

Comment on lines +23 to +31
---

## Root Causes

### FIXED by Rebase (pipecat-ai 0.0.102 → 1.1.0, daily-python 0.23.0 → 0.28.0)

#### RC-1: 500ms Inter-Context Silence — FIXED
**Severity**: HIGH — was the #1 cause of audible gaps
**Status**: FIXED in pipecat-ai 1.1.0

The old `AudioContextTTSService._audio_context_task_handler` (tts_service.py:1171) appended 500ms of silence (`b"\x00" * self.sample_rate`) between every TTS audio context (sentence). BB's structured LLM responses produce many short contexts → many 500ms pauses.

In pipecat-ai 1.1.0:
- The `silence = b"\x00" * self.sample_rate` line is completely removed
- `AudioContextTTSService` is deprecated (now a thin wrapper around `WebsocketTTSService`)
- No inter-context silence is injected

#### RC-2: Outdated daily-python SDK — FIXED
**Severity**: HIGH — missing gapless audio improvements
**Status**: FIXED — daily-python now at 0.28.0

Was 5 versions behind (0.23.0). Version 0.28.0 includes "gapless audio for raw-tracks" (Daily changelog #073, Dec 2025) with WebRTC audio pipeline improvements for raw audio tracks.

---

### STILL PRESENT (Active Root Causes)

#### RC-3: No Audio Pacing in `without_mixer` Path — ACTIVE
**Severity**: MEDIUM-HIGH — likely the primary remaining cause
**Status**: NOT FIXED in pipecat-ai 1.1.0

**File**: `base_output.py:761` (in pipecat-ai package)
**Code**:
```python
async def without_mixer(vad_stop_secs: float) -> AsyncGenerator[Frame, None]:
while True:
try:
frame = await asyncio.wait_for(
self._audio_queue.get(), timeout=vad_stop_secs
)
yield frame
self._audio_queue.task_done()
except TimeoutError:
await self._bot_stopped_speaking()
```

When no background sound mixer is configured (most BB Daily calls), the output path has **zero pacing**:
- Frames are pulled from queue and written to `CustomAudioSource.write_frames()` as fast as they arrive
- No jitter buffer, no pre-buffering, no rate control
- TTS produces audio in bursts → Daily SDK's WebRTC play cursor catches up during delivery delays → gaps
- The `with_mixer` path at least has `await asyncio.sleep(0)` between iterations; `without_mixer` has nothing
- Default chunk size is 40ms (`audio_out_10ms_chunks=4`) — if one write is delayed, the gap is proportionally larger

**Why Automatic is less affected**: Automatic typically has `allow_interruptions=True` and a simpler pipeline with fewer processors. Fewer processors = frames flow through faster = less bursty delivery.

**Audio flow**:
```text
TTS audio frames
→ BaseOutputTransport.MediaSender.handle_audio_frame()
→ resample + buffer in bytearray
→ chunk into 40ms pieces (4 x 10ms default)
→ put into _audio_queue
→ _next_frame() / without_mixer
→ yield immediately (NO pacing)
→ _audio_task_handler()
→ DailyOutputTransport.write_audio_frame()
→ CustomAudioSource.write_frames() [daily-python native]
→ await completion callback
```
Comment on lines +87 to +100
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Add a language tag to the fenced code block to suppress the markdownlint MD040 warning.

📝 Proposed fix
-```
+```text
 TTS audio frames
   → BaseOutputTransport.MediaSender.handle_audio_frame()
🧰 Tools
🪛 markdownlint-cli2 (0.22.1)

[warning] 88-88: Fenced code blocks should have a language specified

(MD040, fenced-code-language)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@docs/TTS_AUDIO_GAP_ROOT_CAUSE_ANALYSIS.md` around lines 87 - 100, The fenced
code block containing the audio flow should include a language tag to silence
markdownlint MD040; change the opening fence from ``` to ```text for the block
that starts with "TTS audio frames" (which documents
BaseOutputTransport.MediaSender.handle_audio_frame(), _next_frame(),
_audio_task_handler(), DailyOutputTransport.write_audio_frame(), and
CustomAudioSource.write_frames()) so the block is treated as plain text by the
linter.


---

### NOT A ROOT CAUSE (Revised)

#### `aggregate_sentences=True` — NOT a cause (beneficial to keep)
Keeping `aggregate_sentences=True` is actually better now that the 500ms inter-context silence is gone:
- Each sentence produces one continuous audio stream (fewer fragment boundaries)
- Sentence-level prosody optimization = better TTS quality
- Fewer TTS requests overall (one per sentence vs one per token)
- Setting `aggregate_sentences=False` would create MORE fragment boundaries with startup overhead per fragment, potentially worsening gaps

---

### ELIMINATED (Not Root Causes)

| Cause | Evidence |
|-------|----------|
| Silero VAD blocking event loop | `analyze_audio()` uses `run_in_executor` in both BB and Automatic — already offloaded to thread pool |
| Event loop contention from BB's extra tasks | Max gap 1.1ms even with 80 competing tasks (test confirmed) |
| SOXR resampler discontinuity | Discontinuity ratio 0.19, well below 2.0 threshold |
| Daily SDK callback latency | P99 0.35ms under 30 competing tasks |
| `aggregate_sentences=True` bursty delivery | Not a cause post-rebase — sentence-level aggregation produces smoother continuous audio |

---

## Fixes — Ordered by Priority

### Fix 1: Reduce Audio Chunk Size (Priority: HIGH, Effort: TRIVIAL)

**What**: Set `audio_out_10ms_chunks=2` in DailyParams (20ms chunks instead of 40ms)
**Why**: Smaller, more frequent writes to Daily SDK → smoother cadence. If one write is delayed by 10ms, the gap is only 20ms total instead of 40ms — proportionally less audible. Same audio content, same playback speed, just smaller write granularity.
**File**: `app/ai/voice/agents/breeze_buddy/agent/transport.py`
**Change**:
```python
# Before
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
audio_in_filter=audio_in_filter,
audio_out_mixer=daily_mixer,
),

# After
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
audio_in_filter=audio_in_filter,
audio_out_mixer=daily_mixer,
audio_out_10ms_chunks=2, # 20ms chunks for smoother Daily WebRTC cadence
),
```
**Risk**: More `write_frames` calls per second (50 vs 25), slightly more overhead. Negligible.
**Rollback**: Remove the parameter (defaults to 4)

---

### Fix 2: Add Pre-Buffering Before Audio Output (Priority: MEDIUM, Effort: MEDIUM)

**What**: Buffer 2-3 audio chunks (40-60ms) before starting to write to Daily SDK
**Why**: Gives the Daily SDK a head start so small delivery delays don't starve the WebRTC play cursor. The SDK has an internal buffer that absorbs jitter once it has some runway — the problem is the initial burst where the play cursor starts immediately with zero buffer.
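**File**: New file `app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py`
**Approach**: Add an `AudioPreBufferProcessor` pipeline stage between the TTS service and `transport.output()` (Daily agent mode only) that holds back the first few audio frames of each bot turn, flushes them together, then passes later frames straight through
**Risk**: Adds 40-60ms initial TTS latency; acceptable for voice calls
**Rollback**: Remove `AudioPreBufferProcessor` from the pipeline in `app/ai/voice/agents/breeze_buddy/agent/pipeline.py`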
Comment on lines +158 to +165
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Fix 2 description is stale — references a file and approach that don't match the actual implementation.

The doc describes an unimplemented transport-wrapper approach at app/ai/voice/agents/breeze_buddy/utils/audio_pacing.py, but the PR actually ships a pipeline FrameProcessor at app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py. Someone reading this document later will look for utils/audio_pacing.py and find nothing.

Consider updating lines 158–165 to reflect what was actually implemented.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@docs/TTS_AUDIO_GAP_ROOT_CAUSE_ANALYSIS.md` around lines 158 - 165, Update the
doc block to reflect the actual implementation: replace references to the
non-existent utils/audio_pacing.py transport-wrapper with the implemented
pipeline FrameProcessor in
app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py (mention the
FrameProcessor/AudioPreBuffer processor name and that it buffers 2–3 initial
audio chunks before forwarding to the Daily SDK rather than wrapping
write_audio_frame). Keep the rest of the text (why, risk, rollback) but change
the "File" and "Approach" lines to point to
app/ai/voice/agents/breeze_buddy/processors/audio_pre_buffer.py and describe the
implemented buffering pipeline stage.


---

### Fix 3: Pipeline-Level Pacing via `asyncio.sleep` (Priority: LOW, Effort: LOW-MEDIUM)

**What**: Add a minimal `asyncio.sleep(0)` yield in the `without_mixer` path (same as `with_mixer` path)
**Why**: Gives other tasks a chance to run between audio frame deliveries, preventing burst delivery
**File**: Monkey-patch or subclass `BaseOutputTransport.MediaSender._next_frame` in pipecat
**Approach**:
```python
async def without_mixer(vad_stop_secs):
while True:
try:
frame = await asyncio.wait_for(
self._audio_queue.get(), timeout=vad_stop_secs
)
yield frame
self._audio_queue.task_done()
await asyncio.sleep(0) # yield to event loop
except TimeoutError:
await self._bot_stopped_speaking()
```
**Risk**: Minimal — `sleep(0)` just yields control to other tasks; doesn't delay audio
**Rollback**: Remove the patch

---

## Verification Plan

After each fix:
1. Run diagnostic tests: `.venv/bin/python -m pytest tests/test_audio_gap_diagnosis.py -v -s`
2. Test with a real Daily call (playground mode) — listen for gaps
3. Test with a telephony call — verify no regression
4. Test with each TTS provider (ElevenLabs, Cartesia)

## Recommended Implementation Order

1. **Fix 1** (chunk size) — trivial one-liner, safe, immediate
2. **Fix 2** (pre-buffering) — more involved, test after above
3. **Fix 3** (asyncio.sleep yield) — lowest priority, least certain impact

## Version History

| Component | Before Rebase | After Rebase |
|-----------|--------------|-------------|
| pipecat-ai | 0.0.102 | 1.1.0 |
| pipecat-ai-flows | 0.0.22 | 1.0.0 |
| daily-python | 0.23.0 | 0.28.0 |
Loading