juspay · Swetha-160303 · Apr 30, 2026 · Copilot · Apr 30, 2026 · Copilot
diff --git a/app/ai/voice/agents/breeze_buddy/agent/__init__.py b/app/ai/voice/agents/breeze_buddy/agent/__init__.py
@@ -65,6 +65,13 @@
 from app.ai.voice.agents.breeze_buddy.services.telephony.base_provider import (
     VoiceCallProvider,
 )
+from app.ai.voice.agents.breeze_buddy.stt.fallback import (
+    ALERT_STT_TERMINAL_FAILURE,
+    STT_FALLBACK_SLACK_TAG,
+    record_stt_failure,
+    send_templated_alert,
+)
+from app.services.service_health import service_health_monitor
-from app.ai.voice.agents.breeze_buddy.stt.fallback import (
-    ALERT_STT_TERMINAL_FAILURE,
-    STT_FALLBACK_SLACK_TAG,
-    record_stt_failure,
-    send_templated_alert,
-)
-from app.services.service_health import service_health_monitor
+from app.ai.voice.agents.breeze_buddy.stt.fallback import record_stt_failure
-from app.ai.voice.agents.breeze_buddy.stt.fallback import (
-    ALERT_STT_TERMINAL_FAILURE,
-    STT_FALLBACK_SLACK_TAG,
-    record_stt_failure,
-    send_templated_alert,
-)
-from app.services.service_health import service_health_monitor
+from app.ai.voice.agents.breeze_buddy.stt.fallback import record_stt_failure
 from app.ai.voice.agents.breeze_buddy.template import TemplateContext
 from app.ai.voice.agents.breeze_buddy.template.builder import FlowConfigBuilder
 from app.ai.voice.agents.breeze_buddy.template.context import with_context
@@ -603,6 +610,42 @@ async def on_pipeline_error(task, error):
                     {"processor": str(processor), "error": error_msg},
                 )
 
+            # Detect STT errors by processor name keywords
+            processor_str = str(processor).lower()
+            stt_keywords = (
+                "stt",
+                "soniox",
+                "deepgram",
+                "transcri",
+                "google",
+                "sarvam",
+            )
+            is_stt_error = any(kw in processor_str for kw in stt_keywords)
-            # Detect STT errors by processor name keywords
-            processor_str = str(processor).lower()
-            stt_keywords = (
-                "stt",
-                "soniox",
-                "deepgram",
-                "transcri",
-                "google",
-                "sarvam",
-            )
-            is_stt_error = any(kw in processor_str for kw in stt_keywords)
+            # Detect STT errors using STT-specific processor identifiers only.
+            # Avoid broad provider substrings like "google", which can also match
+            # non-STT processors such as Google-backed LLM components.
+            processor_str = str(processor).lower()
+            processor_type_str = processor.__class__.__name__.lower()
+            stt_keywords = (
+                "stt",
+                "soniox",
+                "deepgram",
+                "transcri",
+                "sarvam",
+                "speech",
+            )
+            is_stt_error = any(
+                kw in processor_str or kw in processor_type_str
+                for kw in stt_keywords
+            )
-            # Detect STT errors by processor name keywords
-            processor_str = str(processor).lower()
-            stt_keywords = (
-                "stt",
-                "soniox",
-                "deepgram",
-                "transcri",
-                "google",
-                "sarvam",
-            )
-            is_stt_error = any(kw in processor_str for kw in stt_keywords)
+            # Detect STT errors using STT-specific processor identifiers only.
+            # Avoid broad provider substrings like "google", which can also match
+            # non-STT processors such as Google-backed LLM components.
+            processor_str = str(processor).lower()
+            processor_type_str = processor.__class__.__name__.lower()
+            stt_keywords = (
+                "stt",
+                "soniox",
+                "deepgram",
+                "transcri",
+                "sarvam",
+                "speech",
+            )
+            is_stt_error = any(
+                kw in processor_str or kw in processor_type_str
+                for kw in stt_keywords
+            )
+
+            if not is_stt_error:
-            if not is_stt_error:
+            if not is_stt_error:
+                try:
+                    service_health_monitor.record_pipeline_error(
+                        processor=str(processor),
+                        error_message=str(error_msg),
+                    )
+                except Exception as health_err:
+                    logger.warning(
+                        f"Failed to record non-STT pipeline error in service health monitor: {health_err}"
+                    )
-            if not is_stt_error:
+            if not is_stt_error:
+                try:
+                    service_health_monitor.record_pipeline_error(
+                        processor=str(processor),
+                        error_message=str(error_msg),
+                    )
+                except Exception as health_err:
+                    logger.warning(
+                        f"Failed to record non-STT pipeline error in service health monitor: {health_err}"
+                    )
+                return
+
+            logger.warning(f"STT error detected from processor: {processor}")
+
+            # Record failure in fallback system (once per call, Soniox only)
+            if self.stt_provider == "soniox" and not self._stt_failure_recorded:
+                self._stt_failure_recorded = True
+                try:
+                    await record_stt_failure(
+                        error_msg=str(error_msg)[:200],
+                        call_sid=self.call_sid or "",
+                        context="mid-call",
+                    )
+                except Exception as fb_err:
+                    logger.warning(f"STT fallback record_failure failed: {fb_err}")
+
+            # Alert and end call — no mid-call swap in Phase 1
+            fire_and_forget(self._send_mid_call_stt_alert())
+            try:
+                await task.queue_frames([EndFrame()])
+            except Exception:
+                pass
+
         @self.transport.event_handler("on_client_connected")
         async def on_client_connected(transport, client):
             logger.info(f"Client connected: {client}")

diff --git a/app/ai/voice/agents/breeze_buddy/managers/calls.py b/app/ai/voice/agents/breeze_buddy/managers/calls.py
@@ -69,6 +69,7 @@
 )
 from app.services.gcp.storage.storage import upload_file_to_gcs
 from app.services.redis.client import get_redis_service
+from app.services.service_health import service_health_monitor
 
 
 async def _get_lead_config(lead: LeadCallTracker) -> Optional[CallExecutionConfig]:
@@ -456,6 +457,14 @@ async def process_backlog_leads():
                     await release_lock_on_lead_by_id(locked_lead.id)
                     continue
 
+                # Check global service health pause (circuit breaker pattern)
+                if await service_health_monitor.is_globally_paused():
+                    logger.info(
+                        f"Skipping lead {locked_lead.id} - calls are globally paused due to service health"
+                    )
+                    await release_lock_on_lead_by_id(locked_lead.id)
+                    continue
+
                 customer_phone = (locked_lead.payload or {}).get(
                     "customer_mobile_number"
                 )

diff --git a/app/core/config/dynamic.py b/app/core/config/dynamic.py
@@ -341,3 +341,22 @@ async def OUTBOUND_RATE_LIMIT_WINDOW_SECONDS() -> int:
 async def OUTBOUND_RATE_LIMIT_BLOCK_ENABLED() -> bool:
     """Returns OUTBOUND_RATE_LIMIT_BLOCK_ENABLED from Redis"""
     return await get_config("OUTBOUND_RATE_LIMIT_BLOCK_ENABLED", False, bool)
+
+
+# --- Service Health Monitoring Configuration ---
+async def ENABLE_SERVICE_HEALTH_MONITORING() -> bool:
+    """Returns ENABLE_SERVICE_HEALTH_MONITORING from Redis.
+
+    When False (default), service health monitoring is disabled unless
+    explicitly enabled via Redis/DevCycle rollout.
+    When True, service health monitoring is active and will auto-pause
+    calls when upstream service failures exceed thresholds.
+    """
+    return await get_config("ENABLE_SERVICE_HEALTH_MONITORING", False, bool)
+
+
+async def SERVICE_HEALTH_AUTO_RESUME_MINUTES() -> int:
+    """Look up SERVICE_HEALTH_AUTO_RESUME_MINUTES in Redis.
+
+    Minutes without errors before calls auto-resume after a circuit breaker opens.
+    """
+    resume_minutes = await get_config("SERVICE_HEALTH_AUTO_RESUME_MINUTES", 15, int)
+    return resume_minutes
diff --git a/app/main.py b/app/main.py
@@ -65,6 +65,8 @@
 from app.schemas import (
     AutomaticVoiceUserConnectRequest,
 )
+from app.services.fallback import initialize_fallback_tasks
+from app.services.service_health import initialize_service_health_tasks
 from app.services.langfuse.tasks.task import initialize_langfuse_tasks
 from app.services.redis import (
     close_redis_connections,
@@ -167,6 +169,9 @@ async def lifespan(_app: FastAPI):
             # Initialize Langfuse tasks (if configured)
             await initialize_langfuse_tasks(_background_scheduler)
 
+            # Initialize STT fallback reset tasks
+            await initialize_fallback_tasks(_background_scheduler)
+
-
+
+            # Initialize service health monitoring tasks
+            await initialize_service_health_tasks(_background_scheduler)
-
+
+            # Initialize service health monitoring tasks
+            await initialize_service_health_tasks(_background_scheduler)
             ### Register new tasks here
-            # Initialize STT fallback reset tasks
-            await initialize_fallback_tasks(_background_scheduler)
-
-            ### Register new tasks here
+            # Initialize STT fallback reset tasks
+            await initialize_fallback_tasks(_background_scheduler)
+
+            # Initialize service health check tasks
+            await initialize_service_health_tasks(_background_scheduler)
+
+            ### Register new tasks here
-            # Initialize STT fallback reset tasks
-            await initialize_fallback_tasks(_background_scheduler)
-
-            ### Register new tasks here
+            # Initialize STT fallback reset tasks
+            await initialize_fallback_tasks(_background_scheduler)
+
+            # Initialize service health check tasks
+            await initialize_service_health_tasks(_background_scheduler)
+
+            ### Register new tasks here
 
             # Start the scheduler only if tasks are registered