Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,6 +1277,12 @@ def resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration_s
Considers both closed (speaker_timeline) and still-open (_active_speaker_starts)
intervals, since a speaker who started talking during this batch may not have
emitted speaker_end yet when the transcription thread kicks off.

Multi-voice disambiguation: when two or more distinct pyannote IDs in the
same batch resolve to the SAME caption name (e.g. two people speaking from
one shared Meet account), we suffix each name with " Speaker #1", " Speaker #2", etc., ordered
chronologically by first appearance in the batch. This preserves pyannote's
split instead of collapsing both voices to a single label.
"""
if batch_end_ts_ms is None or not speaker_segments:
return {}
Expand All @@ -1293,28 +1299,48 @@ def resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration_s
from collections import defaultdict
batch_start_ms = batch_end_ts_ms - audio_duration_secs * 1000

speaker_times = defaultdict(list)
# Group pyannote segments by their original id, preserving first-appearance
# order (we iterate speaker_segments in chronological order).
speaker_times = {}
first_seen = {}
for seg in speaker_segments:
orig = seg["speaker"]
seg_start_ms = batch_start_ms + seg["start"] * 1000
seg_end_ms = batch_start_ms + seg["end"] * 1000
speaker_times[seg["speaker"]].append((seg_start_ms, seg_end_ms))
if orig not in speaker_times:
speaker_times[orig] = []
first_seen[orig] = seg_start_ms
speaker_times[orig].append((seg_start_ms, seg_end_ms))

resolved = {}
# First pass: majority-vote caption name per pyannote id.
raw_resolved = {} # orig_id -> caption_name
for original_id, time_ranges in speaker_times.items():
name_overlap = defaultdict(float)
for seg_start_ms, seg_end_ms in time_ranges:
for (tl_start, tl_end, name) in intervals:
overlap = max(0.0, min(seg_end_ms, tl_end) - max(seg_start_ms, tl_start))
if overlap > 0:
name_overlap[name] += overlap

if not name_overlap:
continue

best_name = max(name_overlap, key=name_overlap.get)
total_ms = sum(end - start for start, end in time_ranges)
if total_ms > 0 and name_overlap[best_name] / total_ms >= 0.30:
resolved[original_id] = best_name
raw_resolved[original_id] = best_name

# Second pass: disambiguate collisions. If multiple pyannote IDs map to the
# same caption name, assign each a " Speaker #N" suffix in chronological order.
by_name = defaultdict(list) # caption_name -> [orig_id...] (first-appearance order)
for orig_id in sorted(raw_resolved, key=lambda oid: first_seen[oid]):
by_name[raw_resolved[orig_id]].append(orig_id)

resolved = {}
for name, ids in by_name.items():
if len(ids) == 1:
resolved[ids[0]] = name
else:
for i, orig_id in enumerate(ids, start=1):
resolved[orig_id] = f"{name} Speaker #{i}"

return resolved

Expand Down Expand Up @@ -1357,6 +1383,9 @@ def partial_transcribe_and_emit(audio_data, utterance_id, speaker_hint, batch_en

speaker = speaker_hint or "Speaker 1"
segment = {"text": full_transcript, "speaker": speaker, "start": 0.0, "end": audio_duration}
# Anchor the batch on wall-clock so the viewer can compute the true
# moment each sentence was spoken (seg.start is seconds from batch start).
batch_anchor_ms = batch_end_ts_ms if batch_end_ts_ms else int(time.time() * 1000)
ws_payload = {
"transcript": full_transcript,
"source_language": source_lang,
Expand All @@ -1365,6 +1394,8 @@ def partial_transcribe_and_emit(audio_data, utterance_id, speaker_hint, batch_en
"segments": [segment],
"utterance_id": utterance_id,
"is_partial": True,
"batch_end_ts_ms": batch_anchor_ms,
"audio_duration_secs": audio_duration,
}
socketio.emit("new_translation", ws_payload, room="admin")
for lang_code, n in active_language_viewers.items():
Expand Down Expand Up @@ -1395,6 +1426,8 @@ def partial_transcribe_and_emit(audio_data, utterance_id, speaker_hint, batch_en
"utterance_id": utterance_id,
"is_partial": True,
"is_update": True,
"batch_end_ts_ms": batch_anchor_ms,
"audio_duration_secs": audio_duration,
}
socketio.emit("new_translation", ws_payload_final, room="admin")
for lang_code in translated_segments_by_lang:
Expand Down Expand Up @@ -1667,6 +1700,10 @@ def normalize_caps(text):
# Check if it's time to generate a summary
check_and_generate_summary()

# Wall-clock anchor for the batch so the viewer can compute each segment's
# true spoken-at time as batch_end_ts_ms - (audio_duration_secs - seg.start) * 1000.
final_batch_anchor_ms = batch_end_ts_ms if batch_end_ts_ms else int(time.time() * 1000)

# IMMEDIATELY send transcription to UI (before translations)
# This makes the UI feel much more responsive
ws_payload_initial = {
Expand All @@ -1678,6 +1715,8 @@ def normalize_caps(text):
"utterance_id": utterance_id,
"is_partial": False,
"is_initial": True, # Flag to indicate translations are coming
"batch_end_ts_ms": final_batch_anchor_ms,
"audio_duration_secs": audio_duration,
}

if Config.DEBUG:
Expand Down Expand Up @@ -1760,6 +1799,8 @@ def translate_segment_for_language(lang_info, segment):
"utterance_id": utterance_id,
"is_partial": False,
"is_update": True, # Flag to indicate this is a translation update
"batch_end_ts_ms": final_batch_anchor_ms,
"audio_duration_secs": audio_duration,
}

if Config.DEBUG:
Expand Down
9 changes: 8 additions & 1 deletion templates/admin.html
Original file line number Diff line number Diff line change
Expand Up @@ -2051,7 +2051,12 @@ <h2>Settings</h2>
? `<div style="font-weight:600; color:#a78bfa; margin-bottom:4px;">${_escHtml(g.speaker)}</div>`
: '';
const items = g.segs.map(seg => {
const absMs = utteranceMs + (seg.start || 0) * 1000;
let absMs;
if (u.batch_end_ts_ms && u.audio_duration_secs != null) {
absMs = u.batch_end_ts_ms - (u.audio_duration_secs - (seg.start || 0)) * 1000;
} else {
absMs = utteranceMs + (seg.start || 0) * 1000;
}
const ss = String(new Date(absMs).getSeconds()).padStart(2, '0');
return `<li style="display:flex; gap:8px; align-items:baseline; margin-bottom:2px; list-style:none;"><span style="color:#6b7280; font-size:11px; font-variant-numeric:tabular-nums; flex-shrink:0; min-width:24px;">:${ss}</span><span>${_escHtml(seg.text)}</span></li>`;
}).join('');
Expand Down Expand Up @@ -2101,6 +2106,8 @@ <h2>Settings</h2>
const entry = {
utterance_id: uttId,
segments: data.segments,
batch_end_ts_ms: data.batch_end_ts_ms || null,
audio_duration_secs: data.audio_duration_secs || null,
is_partial: !!data.is_partial,
timestamp: data.timestamp,
};
Expand Down
19 changes: 17 additions & 2 deletions templates/viewer.html
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,15 @@ <h1>Select Your Language</h1>
group.segs.forEach(seg => {
const li = document.createElement('li');
li.style.cssText = 'display:flex; gap:8px; align-items:baseline; margin-bottom:2px;';
const absMs = utterance.timestamp + (seg.start || 0) * 1000;
// Prefer batch_end_ts_ms anchor from the backend. Seg.start is
// seconds from batch start, so absolute spoken-at time is
// batch_end - (audio_duration - seg.start) * 1000.
let absMs;
if (utterance.batch_end_ts_ms && utterance.audio_duration_secs != null) {
absMs = utterance.batch_end_ts_ms - (utterance.audio_duration_secs - (seg.start || 0)) * 1000;
} else {
absMs = utterance.timestamp + (seg.start || 0) * 1000;
}
const d = new Date(absMs);
const ss = String(d.getSeconds()).padStart(2, '0');
const tsSpan = document.createElement('span');
Expand Down Expand Up @@ -942,13 +950,20 @@ <h1>Select Your Language</h1>

const uttId = data.utterance_id || `anon_${_anonUtteranceCounter++}`;
const segments = data.translated_segments[selectedLanguage].map(s => ({
speaker: s.speaker || null, text: s.text,
speaker: s.speaker || null,
text: s.text,
start: s.start,
end: s.end,
}));
// True wall-clock anchor for the batch. Each segment's spoken-at
// time = batch_end_ts_ms - (audio_duration_secs - seg.start) * 1000.
const entry = {
utterance_id: uttId,
segments: segments,
is_partial: !!data.is_partial,
timestamp: Date.now(),
batch_end_ts_ms: data.batch_end_ts_ms || null,
audio_duration_secs: data.audio_duration_secs || null,
};

const existingIdx = displayedTranslations.findIndex(u => u.utterance_id === uttId);
Expand Down
Loading