From 0fd5a1acd3e926af2fe5ef858f73e2338f341ef1 Mon Sep 17 00:00:00 2001
From: Aleksandr Nasstrom <a.nasstrom@symfa.com>
Date: Tue, 18 Nov 2025 16:08:49 +0100
Subject: [PATCH 1/2] Improve language detection: add segment-level probability
 aggregation, weighted scoring, early stopping, and more robust handling of
 noisy/mixed audio.

---
 .../WhisperLive/whisper_live/transcriber.py   | 106 ++++++++++++++----
 1 file changed, 85 insertions(+), 21 deletions(-)

diff --git a/services/WhisperLive/whisper_live/transcriber.py b/services/WhisperLive/whisper_live/transcriber.py
index b598360c7..8348a7cde 100644
--- a/services/WhisperLive/whisper_live/transcriber.py
+++ b/services/WhisperLive/whisper_live/transcriber.py
@@ -1748,6 +1748,11 @@ def detect_language(
     ) -> Tuple[str, float, List[Tuple[str, float]]]:
         """
         Use Whisper to detect the language of the input audio or features.
+        
+        Improved algorithm that aggregates probabilities across all segments and uses
+        weighted scoring based on both average probability and consistency (number of
+        segments where the language was detected). This provides more robust language
+        detection, especially for noisy audio or mixed-language content.
 
         Arguments:
             audio: Input audio signal, must be a 1D float array sampled at 16khz.
@@ -1758,14 +1763,19 @@ def detect_language(
                 without speech. This step is using the Silero VAD model.
             vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
                 parameters and default values in the class `VadOptions`).
-            language_detection_threshold: If the maximum probability of the language tokens is
-                higher than this value, the language is detected.
-            language_detection_segments: Number of segments to consider for the language detection.
+            language_detection_threshold: Threshold for early stopping. If the average probability
+                of the top language across processed segments exceeds this value (after at least
+                2 segments), detection stops early. After 3+ segments, a slightly lower threshold
+                (threshold - 0.1, min 0.4) is used for early stopping.
+            language_detection_segments: Maximum number of segments to consider for the language
+                detection. The algorithm may stop earlier if high confidence is achieved.
 
         Returns:
-            language: Detected language.
-            languege_probability: Probability of the detected language.
-            all_language_probs: List of tuples with all language names and probabilities.
+            language: Detected language code (e.g., 'en', 'ru', 'es').
+            language_probability: Average probability of the detected language across all segments
+                where it was detected, or weighted score if multiple segments were processed.
+            all_language_probs: List of tuples with all language names and probabilities from
+                the last processed segment.
         """
         assert (
             audio is not None or features is not None
@@ -1786,7 +1796,15 @@ def detect_language(
             ..., : language_detection_segments * self.feature_extractor.nb_max_frames
         ]
 
-        detected_language_info = {}
+        # Aggregate language probabilities across all segments
+        # Key: language code, Value: list of probabilities from all segments
+        language_prob_aggregator = {}
+        # Store all language probabilities from the last segment for return value
+        all_language_probs = None
+        segments_processed = 0
+        language = None
+        language_probability = None
+        
         for i in range(0, features.shape[-1], self.feature_extractor.nb_max_frames):
             encoder_output = self.encode(
                 pad_or_trim(features[..., i : i + self.feature_extractor.nb_max_frames])
@@ -1795,20 +1813,66 @@ def detect_language(
             results = self.model.detect_language(encoder_output)[0]
 
             # Parse language names to strip out markers
-            all_language_probs = [(token[2:-2], prob) for (token, prob) in results]
-            # Get top language token and probability
-            language, language_probability = all_language_probs[0]
-            if language_probability > language_detection_threshold:
-                break
-            detected_language_info.setdefault(language, []).append(language_probability)
-        else:
-            # If no language detected for all segments, the majority vote of the highest
-            # projected languages for all segments is used to determine the language.
-            language = max(
-                detected_language_info,
-                key=lambda lang: len(detected_language_info[lang]),
-            )
-            language_probability = max(detected_language_info[language])
+            segment_language_probs = [(token[2:-2], prob) for (token, prob) in results]
+            all_language_probs = segment_language_probs
+            
+            # Aggregate probabilities for all languages in this segment
+            for lang, prob in segment_language_probs:
+                if lang not in language_prob_aggregator:
+                    language_prob_aggregator[lang] = []
+                language_prob_aggregator[lang].append(prob)
+            
+            segments_processed += 1
+            
+            # Find the language with the highest plain (unweighted) average probability so far
+            if language_prob_aggregator:
+                # Calculate average probability for each language
+                lang_avg_probs = {
+                    lang: sum(probs) / len(probs) 
+                    for lang, probs in language_prob_aggregator.items()
+                }
+                # Get the language with highest average probability
+                top_lang = max(lang_avg_probs, key=lang_avg_probs.get)
+                top_lang_avg_prob = lang_avg_probs[top_lang]
+                
+                # Early stopping: if we have high confidence and enough segments
+                # Use a slightly lower threshold for early stopping with multiple segments
+                early_stop_threshold = language_detection_threshold
+                if segments_processed >= 3:
+                    # After 3 segments, allow early stop with slightly lower threshold
+                    early_stop_threshold = max(0.4, language_detection_threshold - 0.1)
+                
+                if top_lang_avg_prob > early_stop_threshold and segments_processed >= 2:
+                    # Check if top language is consistently detected
+                    top_lang_count = len(language_prob_aggregator[top_lang])
+                    if top_lang_count >= 2 and top_lang_avg_prob > early_stop_threshold:
+                        language = top_lang
+                        language_probability = top_lang_avg_prob
+                        break
+        
+        # If we didn't break early, determine language from aggregated results
+        if language is None:
+            if not language_prob_aggregator:
+                # Fallback: use the top language from the last segment
+                if all_language_probs:
+                    language, language_probability = all_language_probs[0]
+                else:
+                    language, language_probability = "en", 0.0
+            else:
+                # Calculate weighted average probability for each language
+                # Weight by both average probability and consistency (number of detections)
+                lang_scores = {}
+                for lang, probs in language_prob_aggregator.items():
+                    avg_prob = sum(probs) / len(probs)
+                    # Combine average probability with consistency weight
+                    # More segments with this language = higher confidence
+                    consistency_weight = min(1.0, len(probs) / 3.0)  # Normalize to max 3 segments
+                    lang_scores[lang] = avg_prob * (0.7 + 0.3 * consistency_weight)
+                
+                # Select language with highest combined score
+                language = max(lang_scores, key=lang_scores.get)
+                # Return the average probability for the selected language
+                language_probability = sum(language_prob_aggregator[language]) / len(language_prob_aggregator[language])
 
         return language, language_probability, all_language_probs
 

From 5cb3c914aaa862b2f8840fbb060ee2fff686143b Mon Sep 17 00:00:00 2001
From: Aleksandr Nasstrom <a.nasstrom@symfa.com>
Date: Wed, 19 Nov 2025 17:12:08 +0100
Subject: [PATCH 2/2] Filter low-confidence (silent/noisy) segments in language detection

---
 .../WhisperLive/whisper_live/transcriber.py   | 97 +++++++++++++++++--
 1 file changed, 91 insertions(+), 6 deletions(-)

diff --git a/services/WhisperLive/whisper_live/transcriber.py b/services/WhisperLive/whisper_live/transcriber.py
index 8348a7cde..1ad12c4a9 100644
--- a/services/WhisperLive/whisper_live/transcriber.py
+++ b/services/WhisperLive/whisper_live/transcriber.py
@@ -1753,6 +1753,18 @@ def detect_language(
         weighted scoring based on both average probability and consistency (number of
         segments where the language was detected). This provides more robust language
         detection, especially for noisy audio or mixed-language content.
+        
+        The algorithm filters out segments with low confidence (likely noise or silence)
+        to prevent incorrect language detection (e.g., "nn" for background noise).
+        Segments are filtered if:
+        - Maximum language probability < 0.4 (likely noise/silence)
+        - Top language probability < 0.35 (very low confidence)
+        - Top language probability is too close to second place (< 0.15 difference) 
+          and top probability < 0.5 (uncertain detection)
+        
+        Final validation (skipped when detection stops early): if the aggregated language
+        probability < 0.6, the result is rejected and "en" is returned with probability
+        0.0, preventing false positives from silence/noise.
 
         Arguments:
             audio: Input audio signal, must be a 1D float array sampled at 16khz.
@@ -1771,9 +1783,14 @@ def detect_language(
                 detection. The algorithm may stop earlier if high confidence is achieved.
 
         Returns:
-            language: Detected language code (e.g., 'en', 'ru', 'es').
+            language: Detected language code (e.g., 'en', 'ru', 'es'). Returns 'en' with
+                probability 0.0 if no confident detection could be made (all segments filtered
+                as noise/silence, or final confidence < 0.6).
             language_probability: Average probability of the detected language across all segments
                 where it was detected, or weighted score if multiple segments were processed.
+                Returns 0.0 if detection confidence is too low (< 0.6, likely noise/silence).
+                Note: The calling code (set_language) requires probability > 0.5 to actually
+                set the language, so returning 0.0 prevents false language detection.
             all_language_probs: List of tuples with all language names and probabilities from
                 the last processed segment.
         """
@@ -1805,6 +1822,11 @@ def detect_language(
         language = None
         language_probability = None
         
+        # Minimum confidence threshold for a segment to be considered valid for language detection
+        # Segments with max probability below this are likely noise/silence and should be skipped
+        # Increased from 0.25 to 0.4 to better filter out silence/noise
+        min_segment_confidence = 0.4
+        
         for i in range(0, features.shape[-1], self.feature_extractor.nb_max_frames):
             encoder_output = self.encode(
                 pad_or_trim(features[..., i : i + self.feature_extractor.nb_max_frames])
@@ -1816,11 +1838,46 @@ def detect_language(
             segment_language_probs = [(token[2:-2], prob) for (token, prob) in results]
             all_language_probs = segment_language_probs
             
+            # Filter out segments with low confidence (likely noise/silence)
+            if not segment_language_probs:
+                continue
+            
+            # Get the maximum probability in this segment
+            max_prob = max(prob for _, prob in segment_language_probs)
+            
+            # Skip segments with very low confidence - these are likely noise or silence
+            # (near-ties between the top two languages are handled by the margin check below)
+            if max_prob < min_segment_confidence:
+                # This segment is likely noise/silence, skip it
+                if self.logger:
+                    self.logger.debug(
+                        f"Skipping segment with low confidence (max_prob={max_prob:.3f} < {min_segment_confidence})"
+                    )
+                continue
+            
+            # Additional check: if top language probability is too close to second place,
+            # it might indicate uncertainty/noise. Require at least 0.15 difference for confidence.
+            # Also check if top probability is below a reasonable threshold even if it passed min_segment_confidence
+            if len(segment_language_probs) >= 2:
+                top_prob = segment_language_probs[0][1]
+                second_prob = segment_language_probs[1][1]
+                prob_diff = top_prob - second_prob
+                # Skip if: (uncertainty AND low confidence) OR (very low top probability)
+                if (prob_diff < 0.15 and top_prob < 0.5) or top_prob < 0.35:
+                    # Too uncertain or too low confidence, likely noise/silence
+                    if self.logger:
+                        self.logger.debug(
+                            f"Skipping uncertain/low-confidence segment (top_prob={top_prob:.3f}, diff={prob_diff:.3f})"
+                        )
+                    continue
+            
             # Aggregate probabilities for all languages in this segment
+            # Only include languages with reasonable probability (>= 0.1) to avoid noise
             for lang, prob in segment_language_probs:
-                if lang not in language_prob_aggregator:
-                    language_prob_aggregator[lang] = []
-                language_prob_aggregator[lang].append(prob)
+                if prob >= 0.1:  # Filter out very low probability languages
+                    if lang not in language_prob_aggregator:
+                        language_prob_aggregator[lang] = []
+                    language_prob_aggregator[lang].append(prob)
             
             segments_processed += 1
             
@@ -1853,10 +1910,24 @@ def detect_language(
         # If we didn't break early, determine language from aggregated results
         if language is None:
             if not language_prob_aggregator:
-                # Fallback: use the top language from the last segment
+                # All segments were filtered out (likely all noise/silence)
+                # Fall back to the last segment's top language, but only if it is confident (>= 0.6)
                 if all_language_probs:
-                    language, language_probability = all_language_probs[0]
+                    top_lang, top_prob = all_language_probs[0]
+                    # Only use if probability is at least reasonable (>= 0.6) to avoid false positives
+                    if top_prob >= 0.6:
+                        language, language_probability = top_lang, top_prob
+                    else:
+                        # Very low confidence, return "en" as fallback with low probability
+                        # This allows transcription to proceed, but the system knows it's uncertain
+                        if self.logger:
+                            self.logger.info(
+                                f"All segments filtered out, last segment has low confidence ({top_prob:.3f} < 0.6). "
+                                f"Returning 'en' with probability 0.0"
+                            )
+                        language, language_probability = "en", 0.0
                 else:
+                    # No language probabilities available, use "en" as fallback
                     language, language_probability = "en", 0.0
             else:
                 # Calculate weighted average probability for each language
@@ -1873,6 +1944,20 @@ def detect_language(
                 language = max(lang_scores, key=lang_scores.get)
                 # Return the average probability for the selected language
                 language_probability = sum(language_prob_aggregator[language]) / len(language_prob_aggregator[language])
+                
+                # Final validation: if confidence is too low, don't trust the result
+                # This prevents returning incorrect languages like "nn" for noise
+                # Increased threshold to 0.6 to prevent false positives from silence/noise
+                if language_probability < 0.6:
+                    # Confidence too low, likely noise/silence - return "en" as fallback with low probability
+                    # This allows transcription to proceed, but the system knows it's uncertain
+                    # and won't set the language (set_language requires > 0.5)
+                    if self.logger:
+                        self.logger.info(
+                            f"Language detection confidence too low ({language_probability:.3f} < 0.6), "
+                            f"likely noise/silence. Returning 'en' with probability 0.0"
+                        )
+                    language, language_probability = "en", 0.0
 
         return language, language_probability, all_language_probs