@@ -89,6 +89,12 @@ impl AudioEncoding {
/// Unit struct that carries the `Service` implementation for the ElevenLabs
/// realtime (websocket) integration; it holds no state of its own.
8989#[ derive( Debug ) ]
9090pub struct ElevenLabsTranscribe ;
9191
/// Settings resolved once per conversation and passed by value to
/// `run_conversation_loop` (the struct is `Copy`, so no borrowing is needed).
92+ #[ derive( Debug , Clone , Copy ) ]
93+ struct ConversationLoopConfig {
/// Audio format every incoming `Input::Audio` frame must match; a frame with
/// a different format aborts the loop with a "mixed input audio formats" error.
94+ input_format : AudioFormat ,
/// Forwarded to `process_server_message`. When `true`, plain
/// `committed_transcript` events are skipped; per the in-code note the
/// `committed_transcript_with_timestamps` event is expected instead.
95+ include_language_detection : bool ,
96+ }
97+
9298#[ async_trait]
9399impl Service for ElevenLabsTranscribe {
94100 type Params = Params ;
@@ -101,6 +107,10 @@ impl Service for ElevenLabsTranscribe {
101107 bail ! ( "ElevenLabs realtime currently requires mono input audio" ) ;
102108 }
103109
110+ let include_language_detection = params
111+ . include_language_detection
112+ . unwrap_or ( DEFAULT_INCLUDE_LANGUAGE_DETECTION ) ;
113+
104114 let encoding = resolve_audio_encoding ( input_format) ?;
105115 let endpoint = build_endpoint ( & params, encoding) ?;
106116
@@ -130,7 +140,10 @@ impl Service for ElevenLabsTranscribe {
130140 & mut read,
131141 & outbound_tx,
132142 & mut outbound_closed,
133- input_format,
143+ ConversationLoopConfig {
144+ input_format,
145+ include_language_detection,
146+ } ,
134147 params. previous_text . as_deref ( ) ,
135148 )
136149 . await ;
@@ -154,7 +167,7 @@ async fn run_conversation_loop<R>(
154167 read : & mut R ,
155168 outbound_tx : & mpsc:: UnboundedSender < OutboundMessage > ,
156169 outbound_closed : & mut bool ,
157- input_format : AudioFormat ,
170+ config : ConversationLoopConfig ,
158171 mut previous_text_for_next_chunk : Option < & str > ,
159172) -> Result < ( ) >
160173where
@@ -167,7 +180,7 @@ where
167180 input_event = input. recv( ) , if !input_closed => {
168181 match input_event {
169182 Some ( Input :: Audio { frame } ) => {
170- if frame. format != input_format {
183+ if frame. format != config . input_format {
171184 bail!( "Received mixed input audio formats in conversation" ) ;
172185 }
173186
@@ -190,7 +203,7 @@ where
190203 msg = read. next( ) => {
191204 match msg {
192205 Some ( Ok ( message) ) => {
193- process_server_message( message, output) ?;
206+ process_server_message( message, output, config . include_language_detection ) ?;
194207 }
195208 Some ( Err ( e) ) => {
196209 bail!( "Error reading ElevenLabs websocket: {e}" ) ;
@@ -357,11 +370,15 @@ struct InputAudioChunk<'a> {
357370 previous_text : Option < & ' a str > ,
358371}
359372
360- fn process_server_message ( message : Message , output : & ConversationOutput ) -> Result < ( ) > {
373+ fn process_server_message (
374+ message : Message ,
375+ output : & ConversationOutput ,
376+ include_language_detection : bool ,
377+ ) -> Result < ( ) > {
361378 match message {
362379 Message :: Text ( text) => {
363380 debug ! ( "ElevenLabs websocket received: {}" , text) ;
364- process_server_json ( text. as_str ( ) , output)
381+ process_server_json ( text. as_str ( ) , output, include_language_detection )
365382 }
366383 Message :: Binary ( _) => Ok ( ( ) ) ,
367384 Message :: Ping ( payload) => {
@@ -377,7 +394,11 @@ fn process_server_message(message: Message, output: &ConversationOutput) -> Resu
377394 }
378395}
379396
380- fn process_server_json ( json : & str , output : & ConversationOutput ) -> Result < ( ) > {
397+ fn process_server_json (
398+ json : & str ,
399+ output : & ConversationOutput ,
400+ include_language_detection : bool ,
401+ ) -> Result < ( ) > {
381402 let envelope: RealtimeEnvelope = serde_json:: from_str ( json)
382403 . with_context ( || format ! ( "Parsing ElevenLabs server event: {json}" ) ) ?;
383404
@@ -391,6 +412,10 @@ fn process_server_json(json: &str, output: &ConversationOutput) -> Result<()> {
391412 output. text ( false , event. text , None )
392413 }
393414 "committed_transcript" => {
415+ if include_language_detection {
416+ // Ignoring committed_transcript because include_language_detection=true; expecting committed_transcript_with_timestamps
417+ return Ok ( ( ) ) ;
418+ }
394419 let event: CommittedTranscript = serde_json:: from_value ( envelope. payload ) ?;
395420 output. text ( true , event. text , None )
396421 }
0 commit comments