Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 170 additions & 21 deletions services/WhisperLive/whisper_live/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -1748,6 +1748,23 @@ def detect_language(
) -> Tuple[str, float, List[Tuple[str, float]]]:
"""
Use Whisper to detect the language of the input audio or features.

Improved algorithm that aggregates probabilities across all segments and uses
weighted scoring based on both average probability and consistency (number of
segments where the language was detected). This provides more robust language
detection, especially for noisy audio or mixed-language content.

The algorithm filters out segments with low confidence (likely noise or silence)
to prevent incorrect language detection (e.g., "nn" for background noise).
Segments are filtered if:
- Maximum language probability < 0.4 (likely noise/silence)
- Top language probability < 0.35 (very low confidence)
- Top language probability is too close to second place (< 0.15 difference)
and top probability < 0.5 (uncertain detection)

Final validation: if the aggregated language probability < 0.6, the result is
rejected and "en" is returned with probability 0.0, preventing false positives
from silence/noise.

Arguments:
audio: Input audio signal, must be a 1D float array sampled at 16khz.
Expand All @@ -1758,14 +1775,24 @@ def detect_language(
without speech. This step is using the Silero VAD model.
vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
parameters and default values in the class `VadOptions`).
language_detection_threshold: If the maximum probability of the language tokens is
higher than this value, the language is detected.
language_detection_segments: Number of segments to consider for the language detection.
language_detection_threshold: Threshold for early stopping. If the average probability
of the top language across processed segments exceeds this value (after at least
2 segments), detection stops early. After 3+ segments, a slightly lower threshold
(threshold - 0.1, min 0.4) is used for early stopping.
language_detection_segments: Maximum number of segments to consider for the language
detection. The algorithm may stop earlier if high confidence is achieved.

Returns:
language: Detected language.
languege_probability: Probability of the detected language.  # typo in original: should be "language_probability"
all_language_probs: List of tuples with all language names and probabilities.
language: Detected language code (e.g., 'en', 'ru', 'es'). Returns 'en' with
probability 0.0 if no confident detection could be made (all segments filtered
as noise/silence, or final confidence < 0.6).
language_probability: Average probability of the detected language across all segments
where it was detected, or weighted score if multiple segments were processed.
Returns 0.0 if detection confidence is too low (< 0.6, likely noise/silence).
Note: The calling code (set_language) requires probability > 0.5 to actually
set the language, so returning 0.0 prevents false language detection.
all_language_probs: List of tuples with all language names and probabilities from
the last processed segment.
"""
assert (
audio is not None or features is not None
Expand All @@ -1786,7 +1813,20 @@ def detect_language(
..., : language_detection_segments * self.feature_extractor.nb_max_frames
]

detected_language_info = {}
# Aggregate language probabilities across all segments
# Key: language code, Value: list of probabilities from all segments
language_prob_aggregator = {}
# Store all language probabilities from the last segment for return value
all_language_probs = None
segments_processed = 0
language = None
language_probability = None

# Minimum confidence threshold for a segment to be considered valid for language detection
# Segments with max probability below this are likely noise/silence and should be skipped
# Increased from 0.25 to 0.4 to better filter out silence/noise
min_segment_confidence = 0.4

for i in range(0, features.shape[-1], self.feature_extractor.nb_max_frames):
encoder_output = self.encode(
pad_or_trim(features[..., i : i + self.feature_extractor.nb_max_frames])
Expand All @@ -1795,20 +1835,129 @@ def detect_language(
results = self.model.detect_language(encoder_output)[0]

# Parse language names to strip out markers
all_language_probs = [(token[2:-2], prob) for (token, prob) in results]
# Get top language token and probability
language, language_probability = all_language_probs[0]
if language_probability > language_detection_threshold:
break
detected_language_info.setdefault(language, []).append(language_probability)
else:
# If no language was confidently detected in any segment, fall back to a
# majority vote: pick the language most often ranked highest across segments.
language = max(
detected_language_info,
key=lambda lang: len(detected_language_info[lang]),
)
language_probability = max(detected_language_info[language])
segment_language_probs = [(token[2:-2], prob) for (token, prob) in results]
all_language_probs = segment_language_probs

# Filter out segments with low confidence (likely noise/silence)
if not segment_language_probs:
continue

# Get the maximum probability in this segment
max_prob = max(prob for _, prob in segment_language_probs)

# Skip segments with very low confidence - these are likely noise or silence
# Also check if probabilities are too evenly distributed (high entropy = uncertain)
if max_prob < min_segment_confidence:
# This segment is likely noise/silence, skip it
if self.logger:
self.logger.debug(
f"Skipping segment with low confidence (max_prob={max_prob:.3f} < {min_segment_confidence})"
)
continue

# Additional check: if top language probability is too close to second place,
# it might indicate uncertainty/noise. Require at least 0.15 difference for confidence.
# Also check if top probability is below a reasonable threshold even if it passed min_segment_confidence
if len(segment_language_probs) >= 2:
top_prob = segment_language_probs[0][1]
second_prob = segment_language_probs[1][1]
prob_diff = top_prob - second_prob
# Skip if: (uncertainty AND low confidence) OR (very low top probability)
if (prob_diff < 0.15 and top_prob < 0.5) or top_prob < 0.35:
# Too uncertain or too low confidence, likely noise/silence
if self.logger:
self.logger.debug(
f"Skipping uncertain/low-confidence segment (top_prob={top_prob:.3f}, diff={prob_diff:.3f})"
)
continue

# Aggregate probabilities for all languages in this segment
# Only include languages with reasonable probability (>= 0.1) to avoid noise
for lang, prob in segment_language_probs:
if prob >= 0.1: # Filter out very low probability languages
if lang not in language_prob_aggregator:
language_prob_aggregator[lang] = []
language_prob_aggregator[lang].append(prob)

segments_processed += 1

# Calculate weighted average probability for top language so far
if language_prob_aggregator:
# Calculate average probability for each language
lang_avg_probs = {
lang: sum(probs) / len(probs)
for lang, probs in language_prob_aggregator.items()
}
# Get the language with highest average probability
top_lang = max(lang_avg_probs, key=lang_avg_probs.get)
top_lang_avg_prob = lang_avg_probs[top_lang]

# Early stopping: if we have high confidence and enough segments
# Use a slightly lower threshold for early stopping with multiple segments
early_stop_threshold = language_detection_threshold
if segments_processed >= 3:
# After 3 segments, allow early stop with slightly lower threshold
early_stop_threshold = max(0.4, language_detection_threshold - 0.1)

if top_lang_avg_prob > early_stop_threshold and segments_processed >= 2:
# Check if top language is consistently detected
top_lang_count = len(language_prob_aggregator[top_lang])
if top_lang_count >= 2 and top_lang_avg_prob > early_stop_threshold:
language = top_lang
language_probability = top_lang_avg_prob
break

# If we didn't break early, determine language from aggregated results
if language is None:
if not language_prob_aggregator:
# All segments were filtered out (likely all noise/silence)
# Try to use the last segment with relaxed criteria, but only if we have some confidence
if all_language_probs:
top_lang, top_prob = all_language_probs[0]
# Only use if probability is at least reasonable (>= 0.6) to avoid false positives
if top_prob >= 0.6:
language, language_probability = top_lang, top_prob
else:
# Very low confidence, return "en" as fallback with low probability
# This allows transcription to proceed, but the system knows it's uncertain
if self.logger:
self.logger.info(
f"All segments filtered out, last segment has low confidence ({top_prob:.3f} < 0.6). "
f"Returning 'en' with probability 0.0"
)
language, language_probability = "en", 0.0
else:
# No language probabilities available, use "en" as fallback
language, language_probability = "en", 0.0
else:
# Calculate weighted average probability for each language
# Weight by both average probability and consistency (number of detections)
lang_scores = {}
for lang, probs in language_prob_aggregator.items():
avg_prob = sum(probs) / len(probs)
# Combine average probability with consistency weight
# More segments with this language = higher confidence
consistency_weight = min(1.0, len(probs) / 3.0) # Normalize to max 3 segments
lang_scores[lang] = avg_prob * (0.7 + 0.3 * consistency_weight)

# Select language with highest combined score
language = max(lang_scores, key=lang_scores.get)
# Return the average probability for the selected language
language_probability = sum(language_prob_aggregator[language]) / len(language_prob_aggregator[language])

# Final validation: if confidence is too low, don't trust the result
# This prevents returning incorrect languages like "nn" for noise
# Increased threshold to 0.6 to prevent false positives from silence/noise
if language_probability < 0.6:
# Confidence too low, likely noise/silence - return "en" as fallback with low probability
# This allows transcription to proceed, but the system knows it's uncertain
# and won't set the language (set_language requires > 0.5)
if self.logger:
self.logger.info(
f"Language detection confidence too low ({language_probability:.3f} < 0.6), "
f"likely noise/silence. Returning 'en' with probability 0.0"
)
language, language_probability = "en", 0.0

return language, language_probability, all_language_probs

Expand Down