Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 170 additions & 21 deletions services/WhisperLive/whisper_live/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -1748,6 +1748,23 @@ def detect_language(
) -> Tuple[str, float, List[Tuple[str, float]]]:
"""
Use Whisper to detect the language of the input audio or features.

Improved algorithm that aggregates probabilities across all segments and uses
weighted scoring based on both average probability and consistency (number of
segments where the language was detected). This provides more robust language
detection, especially for noisy audio or mixed-language content.

The algorithm filters out segments with low confidence (likely noise or silence)
to prevent incorrect language detection (e.g., "nn" for background noise).
Segments are filtered if:
- Maximum language probability < 0.4 (likely noise/silence)
- Top language probability < 0.35 (very low confidence)
- Top language probability is too close to second place (< 0.15 difference)
and top probability < 0.5 (uncertain detection)

Final validation: if the aggregated language probability < 0.6, the result is
rejected and "en" is returned with probability 0.0, preventing false positives
from silence/noise.

Arguments:
audio: Input audio signal, must be a 1D float array sampled at 16khz.
Expand All @@ -1758,14 +1775,24 @@ def detect_language(
without speech. This step is using the Silero VAD model.
vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
parameters and default values in the class `VadOptions`).
language_detection_threshold: If the maximum probability of the language tokens is
higher than this value, the language is detected.
language_detection_segments: Number of segments to consider for the language detection.
language_detection_threshold: Threshold for early stopping. If the average probability
of the top language across processed segments exceeds this value (after at least
2 segments), detection stops early. After 3+ segments, a slightly lower threshold
(threshold - 0.1, min 0.4) is used for early stopping.
language_detection_segments: Maximum number of segments to consider for the language
detection. The algorithm may stop earlier if high confidence is achieved.

Returns:
language: Detected language.
languege_probability: Probability of the detected language.  # typo in original: should be "language_probability"
all_language_probs: List of tuples with all language names and probabilities.
language: Detected language code (e.g., 'en', 'ru', 'es'). Returns 'en' with
probability 0.0 if no confident detection could be made (all segments filtered
as noise/silence, or final confidence < 0.6).
language_probability: Average probability of the detected language across all segments
where it was detected, or weighted score if multiple segments were processed.
Returns 0.0 if detection confidence is too low (< 0.6, likely noise/silence).
Note: The calling code (set_language) requires probability > 0.5 to actually
set the language, so returning 0.0 prevents false language detection.
all_language_probs: List of tuples with all language names and probabilities from
the last processed segment.
"""
assert (
audio is not None or features is not None
Expand All @@ -1786,7 +1813,20 @@ def detect_language(
..., : language_detection_segments * self.feature_extractor.nb_max_frames
]

detected_language_info = {}
# Aggregate language probabilities across all segments
# Key: language code, Value: list of probabilities from all segments
language_prob_aggregator = {}
# Store all language probabilities from the last segment for return value
all_language_probs = None
segments_processed = 0
language = None
language_probability = None

# Minimum confidence threshold for a segment to be considered valid for language detection
# Segments with max probability below this are likely noise/silence and should be skipped
# Increased from 0.25 to 0.4 to better filter out silence/noise
min_segment_confidence = 0.4

for i in range(0, features.shape[-1], self.feature_extractor.nb_max_frames):
encoder_output = self.encode(
pad_or_trim(features[..., i : i + self.feature_extractor.nb_max_frames])
Expand All @@ -1795,20 +1835,129 @@ def detect_language(
results = self.model.detect_language(encoder_output)[0]

# Parse language names to strip out markers
all_language_probs = [(token[2:-2], prob) for (token, prob) in results]
# Get top language token and probability
language, language_probability = all_language_probs[0]
if language_probability > language_detection_threshold:
break
detected_language_info.setdefault(language, []).append(language_probability)
else:
# If no language was confidently detected in any segment, fall back to a
# majority vote: pick the language most often ranked highest across segments.
language = max(
detected_language_info,
key=lambda lang: len(detected_language_info[lang]),
)
language_probability = max(detected_language_info[language])
segment_language_probs = [(token[2:-2], prob) for (token, prob) in results]
all_language_probs = segment_language_probs

# Filter out segments with low confidence (likely noise/silence)
if not segment_language_probs:
continue

# Get the maximum probability in this segment
max_prob = max(prob for _, prob in segment_language_probs)

# Skip segments with very low confidence - these are likely noise or silence
# Also check if probabilities are too evenly distributed (high entropy = uncertain)
if max_prob < min_segment_confidence:
# This segment is likely noise/silence, skip it
if self.logger:
self.logger.debug(
f"Skipping segment with low confidence (max_prob={max_prob:.3f} < {min_segment_confidence})"
)
continue

# Additional check: if top language probability is too close to second place,
# it might indicate uncertainty/noise. Require at least 0.15 difference for confidence.
# Also check if top probability is below a reasonable threshold even if it passed min_segment_confidence
if len(segment_language_probs) >= 2:
top_prob = segment_language_probs[0][1]
second_prob = segment_language_probs[1][1]
prob_diff = top_prob - second_prob
# Skip if: (uncertainty AND low confidence) OR (very low top probability)
if (prob_diff < 0.15 and top_prob < 0.5) or top_prob < 0.35:
# Too uncertain or too low confidence, likely noise/silence
if self.logger:
self.logger.debug(
f"Skipping uncertain/low-confidence segment (top_prob={top_prob:.3f}, diff={prob_diff:.3f})"
)
continue

# Aggregate probabilities for all languages in this segment
# Only include languages with reasonable probability (>= 0.1) to avoid noise
for lang, prob in segment_language_probs:
if prob >= 0.1: # Filter out very low probability languages
if lang not in language_prob_aggregator:
language_prob_aggregator[lang] = []
language_prob_aggregator[lang].append(prob)

segments_processed += 1

# Calculate weighted average probability for top language so far
if language_prob_aggregator:
# Calculate average probability for each language
lang_avg_probs = {
lang: sum(probs) / len(probs)
for lang, probs in language_prob_aggregator.items()
}
# Get the language with highest average probability
top_lang = max(lang_avg_probs, key=lang_avg_probs.get)
top_lang_avg_prob = lang_avg_probs[top_lang]

# Early stopping: if we have high confidence and enough segments
# Use a slightly lower threshold for early stopping with multiple segments
early_stop_threshold = language_detection_threshold
if segments_processed >= 3:
# After 3 segments, allow early stop with slightly lower threshold
early_stop_threshold = max(0.4, language_detection_threshold - 0.1)

if top_lang_avg_prob > early_stop_threshold and segments_processed >= 2:
# Check if top language is consistently detected
top_lang_count = len(language_prob_aggregator[top_lang])
if top_lang_count >= 2 and top_lang_avg_prob > early_stop_threshold:
language = top_lang
language_probability = top_lang_avg_prob
break

# If we didn't break early, determine language from aggregated results
if language is None:
if not language_prob_aggregator:
# All segments were filtered out (likely all noise/silence)
# Try to use the last segment with relaxed criteria, but only if we have some confidence
if all_language_probs:
top_lang, top_prob = all_language_probs[0]
# Only use if probability is at least reasonable (>= 0.6) to avoid false positives
if top_prob >= 0.6:
language, language_probability = top_lang, top_prob
else:
# Very low confidence, return "en" as fallback with low probability
# This allows transcription to proceed, but the system knows it's uncertain
if self.logger:
self.logger.info(
f"All segments filtered out, last segment has low confidence ({top_prob:.3f} < 0.6). "
f"Returning 'en' with probability 0.0"
)
language, language_probability = "en", 0.0
else:
# No language probabilities available, use "en" as fallback
language, language_probability = "en", 0.0
else:
# Calculate weighted average probability for each language
# Weight by both average probability and consistency (number of detections)
lang_scores = {}
for lang, probs in language_prob_aggregator.items():
avg_prob = sum(probs) / len(probs)
# Combine average probability with consistency weight
# More segments with this language = higher confidence
consistency_weight = min(1.0, len(probs) / 3.0) # Normalize to max 3 segments
lang_scores[lang] = avg_prob * (0.7 + 0.3 * consistency_weight)

# Select language with highest combined score
language = max(lang_scores, key=lang_scores.get)
# Return the average probability for the selected language
language_probability = sum(language_prob_aggregator[language]) / len(language_prob_aggregator[language])

# Final validation: if confidence is too low, don't trust the result
# This prevents returning incorrect languages like "nn" for noise
# Increased threshold to 0.6 to prevent false positives from silence/noise
if language_probability < 0.6:
# Confidence too low, likely noise/silence - return "en" as fallback with low probability
# This allows transcription to proceed, but the system knows it's uncertain
# and won't set the language (set_language requires > 0.5)
if self.logger:
self.logger.info(
f"Language detection confidence too low ({language_probability:.3f} < 0.6), "
f"likely noise/silence. Returning 'en' with probability 0.0"
)
language, language_probability = "en", 0.0

return language, language_probability, all_language_probs

Expand Down