From 0fd5a1acd3e926af2fe5ef858f73e2338f341ef1 Mon Sep 17 00:00:00 2001 From: Aleksandr Nasstrom Date: Tue, 18 Nov 2025 16:08:49 +0100 Subject: [PATCH 1/2] Improve language detection: add segment-level probability aggregation, weighted scoring, early stopping, and more robust handling of noisy/mixed audio. --- .../WhisperLive/whisper_live/transcriber.py | 106 ++++++++++++++---- 1 file changed, 85 insertions(+), 21 deletions(-) diff --git a/services/WhisperLive/whisper_live/transcriber.py b/services/WhisperLive/whisper_live/transcriber.py index b598360c7..8348a7cde 100644 --- a/services/WhisperLive/whisper_live/transcriber.py +++ b/services/WhisperLive/whisper_live/transcriber.py @@ -1748,6 +1748,11 @@ def detect_language( ) -> Tuple[str, float, List[Tuple[str, float]]]: """ Use Whisper to detect the language of the input audio or features. + + Improved algorithm that aggregates probabilities across all segments and uses + weighted scoring based on both average probability and consistency (number of + segments where the language was detected). This provides more robust language + detection, especially for noisy audio or mixed-language content. Arguments: audio: Input audio signal, must be a 1D float array sampled at 16khz. @@ -1758,14 +1763,19 @@ def detect_language( without speech. This step is using the Silero VAD model. vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available parameters and default values in the class `VadOptions`). - language_detection_threshold: If the maximum probability of the language tokens is - higher than this value, the language is detected. - language_detection_segments: Number of segments to consider for the language detection. + language_detection_threshold: Threshold for early stopping. If the average probability + of the top language across processed segments exceeds this value (after at least + 2 segments), detection stops early. After 3+ segments, a slightly lower threshold + (threshold - 0.1, min 0.4) is used for early stopping. + language_detection_segments: Maximum number of segments to consider for the language + detection. The algorithm may stop earlier if high confidence is achieved. Returns: - language: Detected language. - languege_probability: Probability of the detected language. - all_language_probs: List of tuples with all language names and probabilities. + language: Detected language code (e.g., 'en', 'ru', 'es'). + language_probability: Average probability of the detected language across all segments + where it was detected, or weighted score if multiple segments were processed. + all_language_probs: List of tuples with all language names and probabilities from + the last processed segment. """ assert ( audio is not None or features is not None @@ -1786,7 +1796,15 @@ def detect_language( ..., : language_detection_segments * self.feature_extractor.nb_max_frames ] - detected_language_info = {} + # Aggregate language probabilities across all segments + # Key: language code, Value: list of probabilities from all segments + language_prob_aggregator = {} + # Store all language probabilities from the last segment for return value + all_language_probs = None + segments_processed = 0 + language = None + language_probability = None + for i in range(0, features.shape[-1], self.feature_extractor.nb_max_frames): encoder_output = self.encode( pad_or_trim(features[..., i : i + self.feature_extractor.nb_max_frames]) @@ -1795,20 +1813,66 @@ def detect_language( results = self.model.detect_language(encoder_output)[0] # Parse language names to strip out markers - all_language_probs = [(token[2:-2], prob) for (token, prob) in results] - # Get top language token and probability - language, language_probability = all_language_probs[0] - if language_probability > language_detection_threshold: - break - detected_language_info.setdefault(language, []).append(language_probability) - else: - # If no language detected for all segments, the majority vote of the highest - # projected languages for all segments is used to determine the language. - language = max( - detected_language_info, - key=lambda lang: len(detected_language_info[lang]), - ) - language_probability = max(detected_language_info[language]) + segment_language_probs = [(token[2:-2], prob) for (token, prob) in results] + all_language_probs = segment_language_probs + + # Aggregate probabilities for all languages in this segment + for lang, prob in segment_language_probs: + if lang not in language_prob_aggregator: + language_prob_aggregator[lang] = [] + language_prob_aggregator[lang].append(prob) + + segments_processed += 1 + + # Calculate weighted average probability for top language so far + if language_prob_aggregator: + # Calculate average probability for each language + lang_avg_probs = { + lang: sum(probs) / len(probs) + for lang, probs in language_prob_aggregator.items() + } + # Get the language with highest average probability + top_lang = max(lang_avg_probs, key=lang_avg_probs.get) + top_lang_avg_prob = lang_avg_probs[top_lang] + + # Early stopping: if we have high confidence and enough segments + # Use a slightly lower threshold for early stopping with multiple segments + early_stop_threshold = language_detection_threshold + if segments_processed >= 3: + # After 3 segments, allow early stop with slightly lower threshold + early_stop_threshold = max(0.4, language_detection_threshold - 0.1) + + if top_lang_avg_prob > early_stop_threshold and segments_processed >= 2: + # Check if top language is consistently detected + top_lang_count = len(language_prob_aggregator[top_lang]) + if top_lang_count >= 2 and top_lang_avg_prob > early_stop_threshold: + language = top_lang + language_probability = top_lang_avg_prob + break + + # If we didn't break early, determine language from aggregated results + if language is None: + if not language_prob_aggregator: + # Fallback: use the top language from the last segment + if all_language_probs: + language, language_probability = all_language_probs[0] + else: + language, language_probability = "en", 0.0 + else: + # Calculate weighted average probability for each language + # Weight by both average probability and consistency (number of detections) + lang_scores = {} + for lang, probs in language_prob_aggregator.items(): + avg_prob = sum(probs) / len(probs) + # Combine average probability with consistency weight + # More segments with this language = higher confidence + consistency_weight = min(1.0, len(probs) / 3.0) # Normalize to max 3 segments + lang_scores[lang] = avg_prob * (0.7 + 0.3 * consistency_weight) + + # Select language with highest combined score + language = max(lang_scores, key=lang_scores.get) + # Return the average probability for the selected language + language_probability = sum(language_prob_aggregator[language]) / len(language_prob_aggregator[language]) return language, language_probability, all_language_probs From 5cb3c914aaa862b2f8840fbb060ee2fff686143b Mon Sep 17 00:00:00 2001 From: Aleksandr Nasstrom Date: Wed, 19 Nov 2025 17:12:08 +0100 Subject: [PATCH 2/2] silent improvements --- .../WhisperLive/whisper_live/transcriber.py | 97 +++++++++++++++++-- 1 file changed, 91 insertions(+), 6 deletions(-) diff --git a/services/WhisperLive/whisper_live/transcriber.py b/services/WhisperLive/whisper_live/transcriber.py index 8348a7cde..1ad12c4a9 100644 --- a/services/WhisperLive/whisper_live/transcriber.py +++ b/services/WhisperLive/whisper_live/transcriber.py @@ -1753,6 +1753,18 @@ def detect_language( weighted scoring based on both average probability and consistency (number of segments where the language was detected). This provides more robust language detection, especially for noisy audio or mixed-language content. + + The algorithm filters out segments with low confidence (likely noise or silence) + to prevent incorrect language detection (e.g., "nn" for background noise). + Segments are filtered if: + - Maximum language probability < 0.4 (likely noise/silence) + - Top language probability < 0.35 (very low confidence) + - Top language probability is too close to second place (< 0.15 difference) + and top probability < 0.5 (uncertain detection) + + Final validation: if the aggregated language probability < 0.6, the result is + rejected and "en" is returned with probability 0.0, preventing false positives + from silence/noise. Arguments: audio: Input audio signal, must be a 1D float array sampled at 16khz. @@ -1771,9 +1783,14 @@ def detect_language( detection. The algorithm may stop earlier if high confidence is achieved. Returns: - language: Detected language code (e.g., 'en', 'ru', 'es'). + language: Detected language code (e.g., 'en', 'ru', 'es'). Returns 'en' with + probability 0.0 if no confident detection could be made (all segments filtered + as noise/silence, or final confidence < 0.6). language_probability: Average probability of the detected language across all segments where it was detected, or weighted score if multiple segments were processed. + Returns 0.0 if detection confidence is too low (< 0.6, likely noise/silence). + Note: The calling code (set_language) requires probability > 0.5 to actually + set the language, so returning 0.0 prevents false language detection. all_language_probs: List of tuples with all language names and probabilities from the last processed segment. """ @@ -1805,6 +1822,11 @@ def detect_language( language = None language_probability = None + # Minimum confidence threshold for a segment to be considered valid for language detection + # Segments with max probability below this are likely noise/silence and should be skipped + # Increased from 0.25 to 0.4 to better filter out silence/noise + min_segment_confidence = 0.4 + for i in range(0, features.shape[-1], self.feature_extractor.nb_max_frames): encoder_output = self.encode( pad_or_trim(features[..., i : i + self.feature_extractor.nb_max_frames]) @@ -1816,11 +1838,46 @@ def detect_language( segment_language_probs = [(token[2:-2], prob) for (token, prob) in results] all_language_probs = segment_language_probs + # Filter out segments with low confidence (likely noise/silence) + if not segment_language_probs: + continue + + # Get the maximum probability in this segment + max_prob = max(prob for _, prob in segment_language_probs) + + # Skip segments with very low confidence - these are likely noise or silence + # Also check if probabilities are too evenly distributed (high entropy = uncertain) + if max_prob < min_segment_confidence: + # This segment is likely noise/silence, skip it + if self.logger: + self.logger.debug( + f"Skipping segment with low confidence (max_prob={max_prob:.3f} < {min_segment_confidence})" + ) + continue + + # Additional check: if top language probability is too close to second place, + # it might indicate uncertainty/noise. Require at least 0.15 difference for confidence. + # Also check if top probability is below a reasonable threshold even if it passed min_segment_confidence + if len(segment_language_probs) >= 2: + top_prob = segment_language_probs[0][1] + second_prob = segment_language_probs[1][1] + prob_diff = top_prob - second_prob + # Skip if: (uncertainty AND low confidence) OR (very low top probability) + if (prob_diff < 0.15 and top_prob < 0.5) or top_prob < 0.35: + # Too uncertain or too low confidence, likely noise/silence + if self.logger: + self.logger.debug( + f"Skipping uncertain/low-confidence segment (top_prob={top_prob:.3f}, diff={prob_diff:.3f})" + ) + continue + # Aggregate probabilities for all languages in this segment + # Only include languages with reasonable probability (>= 0.1) to avoid noise for lang, prob in segment_language_probs: - if lang not in language_prob_aggregator: - language_prob_aggregator[lang] = [] - language_prob_aggregator[lang].append(prob) + if prob >= 0.1: # Filter out very low probability languages + if lang not in language_prob_aggregator: + language_prob_aggregator[lang] = [] + language_prob_aggregator[lang].append(prob) segments_processed += 1 @@ -1853,10 +1910,24 @@ def detect_language( # If we didn't break early, determine language from aggregated results if language is None: if not language_prob_aggregator: - # Fallback: use the top language from the last segment + # All segments were filtered out (likely all noise/silence) + # Try to use the last segment with relaxed criteria, but only if we have some confidence if all_language_probs: - language, language_probability = all_language_probs[0] + top_lang, top_prob = all_language_probs[0] + # Only use if probability is at least reasonable (>= 0.6) to avoid false positives + if top_prob >= 0.6: + language, language_probability = top_lang, top_prob + else: + # Very low confidence, return "en" as fallback with low probability + # This allows transcription to proceed, but the system knows it's uncertain + if self.logger: + self.logger.info( + f"All segments filtered out, last segment has low confidence ({top_prob:.3f} < 0.6). " + f"Returning 'en' with probability 0.0" + ) + language, language_probability = "en", 0.0 else: + # No language probabilities available, use "en" as fallback language, language_probability = "en", 0.0 else: # Calculate weighted average probability for each language @@ -1873,6 +1944,20 @@ def detect_language( language = max(lang_scores, key=lang_scores.get) # Return the average probability for the selected language language_probability = sum(language_prob_aggregator[language]) / len(language_prob_aggregator[language]) + + # Final validation: if confidence is too low, don't trust the result + # This prevents returning incorrect languages like "nn" for noise + # Increased threshold to 0.6 to prevent false positives from silence/noise + if language_probability < 0.6: + # Confidence too low, likely noise/silence - return "en" as fallback with low probability + # This allows transcription to proceed, but the system knows it's uncertain + # and won't set the language (set_language requires > 0.5) + if self.logger: + self.logger.info( + f"Language detection confidence too low ({language_probability:.3f} < 0.6), " + f"likely noise/silence. Returning 'en' with probability 0.0" + ) + language, language_probability = "en", 0.0 return language, language_probability, all_language_probs