From 4ba8b1a63fb9fde68e892de197d87c560fe9553d Mon Sep 17 00:00:00 2001
From: Chabert Etienne <etienne.chabert@gmail.com>
Date: Tue, 21 Apr 2026 17:55:25 +0200
Subject: [PATCH] Multi-voice disambiguation + accurate per-segment spoken-at
 times
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- resolve_speaker_identity() now disambiguates when multiple pyannote
  IDs in one batch resolve to the same caption name (common when two
  people share a single Meet tile or one participant is
  screen-sharing). Distinct voices get "<Name> Speaker #1",
  "<Name> Speaker #2", … ordered chronologically by first appearance.
  One-voice case keeps the plain name.
- Both partial and final WS payloads now carry batch_end_ts_ms +
  audio_duration_secs. The frontend computes each sentence's true
  spoken-at timestamp as batch_end - (audio_duration - seg.start)*1000
  instead of anchoring on "when the viewer received the first partial",
  which had all bullets showing the same :SS for multi-segment
  utterances.
- Viewer new_translation handler now carries start/end through from
  translated segments (they were dropped, making every bullet display
  the utterance base time).
- Admin + viewer renderers use the batch anchor when present and fall
  back to the old formula for legacy payloads.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 app.py                | 53 ++++++++++++++++++++++++++++++++++++++-----
 templates/admin.html  |  9 +++++++-
 templates/viewer.html | 19 ++++++++++++++--
 3 files changed, 72 insertions(+), 9 deletions(-)
diff --git a/app.py b/app.py
index 79e5311..f7c6d7f 100644
--- a/app.py
+++ b/app.py
@@ -1277,6 +1277,12 @@ def resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration_s
     Considers both closed (speaker_timeline) and still-open (_active_speaker_starts)
     intervals, since a speaker who started talking during this batch may not have
     emitted speaker_end yet when the transcription thread kicks off.
+
+    Multi-voice disambiguation: when two or more distinct pyannote IDs in the
+    same batch resolve to the SAME caption name (e.g. two people speaking from
+    one shared Meet account), we suffix each name with "#1", "#2", etc., ordered
+    chronologically by first appearance in the batch. This preserves pyannote's
+    split instead of collapsing both voices to a single label.
     """
     if batch_end_ts_ms is None or not speaker_segments:
         return {}
@@ -1293,13 +1299,21 @@ def resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration_s
     from collections import defaultdict
     batch_start_ms = batch_end_ts_ms - audio_duration_secs * 1000
 
-    speaker_times = defaultdict(list)
+    # Group pyannote segments by their original id, preserving first-appearance
+    # order (we iterate speaker_segments in chronological order).
+    speaker_times = {}
+    first_seen = {}
     for seg in speaker_segments:
+        orig = seg["speaker"]
         seg_start_ms = batch_start_ms + seg["start"] * 1000
         seg_end_ms = batch_start_ms + seg["end"] * 1000
-        speaker_times[seg["speaker"]].append((seg_start_ms, seg_end_ms))
+        if orig not in speaker_times:
+            speaker_times[orig] = []
+            first_seen[orig] = seg_start_ms
+        speaker_times[orig].append((seg_start_ms, seg_end_ms))
 
-    resolved = {}
+    # First pass: majority-vote caption name per pyannote id.
+    raw_resolved = {}  # orig_id -> caption_name
     for original_id, time_ranges in speaker_times.items():
         name_overlap = defaultdict(float)
         for seg_start_ms, seg_end_ms in time_ranges:
@@ -1307,14 +1321,26 @@ def resolve_speaker_identity(speaker_segments, batch_end_ts_ms, audio_duration_s
                 overlap = max(0.0, min(seg_end_ms, tl_end) - max(seg_start_ms, tl_start))
                 if overlap > 0:
                     name_overlap[name] += overlap
-
         if not name_overlap:
             continue
-
         best_name = max(name_overlap, key=name_overlap.get)
         total_ms = sum(end - start for start, end in time_ranges)
         if total_ms > 0 and name_overlap[best_name] / total_ms >= 0.30:
-            resolved[original_id] = best_name
+            raw_resolved[original_id] = best_name
+
+    # Second pass: disambiguate collisions. If multiple pyannote IDs map to the
+    # same caption name, assign each a "#N" suffix in chronological order.
+    by_name = defaultdict(list)  # caption_name -> [orig_id...] (first-appearance order)
+    for orig_id in sorted(raw_resolved, key=lambda oid: first_seen[oid]):
+        by_name[raw_resolved[orig_id]].append(orig_id)
+
+    resolved = {}
+    for name, ids in by_name.items():
+        if len(ids) == 1:
+            resolved[ids[0]] = name
+        else:
+            for i, orig_id in enumerate(ids, start=1):
+                resolved[orig_id] = f"{name} Speaker #{i}"
 
     return resolved
 
@@ -1357,6 +1383,9 @@ def partial_transcribe_and_emit(audio_data, utterance_id, speaker_hint, batch_en
 
         speaker = speaker_hint or "Speaker 1"
         segment = {"text": full_transcript, "speaker": speaker, "start": 0.0, "end": audio_duration}
+        # Anchor the batch on wall-clock so the viewer can compute the true
+        # moment each sentence was spoken (seg.start is seconds from batch start).
+        batch_anchor_ms = batch_end_ts_ms if batch_end_ts_ms else int(time.time() * 1000)
         ws_payload = {
             "transcript": full_transcript,
             "source_language": source_lang,
@@ -1365,6 +1394,8 @@ def partial_transcribe_and_emit(audio_data, utterance_id, speaker_hint, batch_en
             "segments": [segment],
             "utterance_id": utterance_id,
             "is_partial": True,
+            "batch_end_ts_ms": batch_anchor_ms,
+            "audio_duration_secs": audio_duration,
         }
         socketio.emit("new_translation", ws_payload, room="admin")
         for lang_code, n in active_language_viewers.items():
@@ -1395,6 +1426,8 @@ def partial_transcribe_and_emit(audio_data, utterance_id, speaker_hint, batch_en
             "utterance_id": utterance_id,
             "is_partial": True,
             "is_update": True,
+            "batch_end_ts_ms": batch_anchor_ms,
+            "audio_duration_secs": audio_duration,
         }
         socketio.emit("new_translation", ws_payload_final, room="admin")
         for lang_code in translated_segments_by_lang:
@@ -1667,6 +1700,10 @@ def normalize_caps(text):
         # Check if it's time to generate a summary
         check_and_generate_summary()
 
+        # Wall-clock anchor for the batch so the viewer can compute each
+        # segment's true spoken-at time from batch_end_ts_ms + seg.start.
+        final_batch_anchor_ms = batch_end_ts_ms if batch_end_ts_ms else int(time.time() * 1000)
+
         # IMMEDIATELY send transcription to UI (before translations)
         # This makes the UI feel much more responsive
         ws_payload_initial = {
@@ -1678,6 +1715,8 @@ def normalize_caps(text):
             "utterance_id": utterance_id,
             "is_partial": False,
             "is_initial": True,  # Flag to indicate translations are coming
+            "batch_end_ts_ms": final_batch_anchor_ms,
+            "audio_duration_secs": audio_duration,
         }
 
         if Config.DEBUG:
@@ -1760,6 +1799,8 @@ def translate_segment_for_language(lang_info, segment):
             "utterance_id": utterance_id,
             "is_partial": False,
             "is_update": True,  # Flag to indicate this is a translation update
+            "batch_end_ts_ms": final_batch_anchor_ms,
+            "audio_duration_secs": audio_duration,
         }
 
         if Config.DEBUG:
diff --git a/templates/admin.html b/templates/admin.html
index 98215ea..57bd652 100644
--- a/templates/admin.html
+++ b/templates/admin.html
@@ -2051,7 +2051,12 @@ <h2>Settings</h2>
                         ? `<div style="font-weight:600; color:#a78bfa; margin-bottom:4px;">${_escHtml(g.speaker)}</div>`
                         : '';
                     const items = g.segs.map(seg => {
-                        const absMs = utteranceMs + (seg.start || 0) * 1000;
+                        let absMs;
+                        if (u.batch_end_ts_ms && u.audio_duration_secs != null) {
+                            absMs = u.batch_end_ts_ms - (u.audio_duration_secs - (seg.start || 0)) * 1000;
+                        } else {
+                            absMs = utteranceMs + (seg.start || 0) * 1000;
+                        }
                         const ss = String(new Date(absMs).getSeconds()).padStart(2, '0');
                         return `<li style="display:flex; gap:8px; align-items:baseline; margin-bottom:2px; list-style:none;"><span style="color:#6b7280; font-size:11px; font-variant-numeric:tabular-nums; flex-shrink:0; min-width:24px;">:${ss}</span><span>${_escHtml(seg.text)}</span></li>`;
                     }).join('');
@@ -2101,6 +2106,8 @@ <h2>Settings</h2>
                 const entry = {
                     utterance_id: uttId,
                     segments: data.segments,
+                    batch_end_ts_ms: data.batch_end_ts_ms || null,
+                    audio_duration_secs: data.audio_duration_secs || null,
                     is_partial: !!data.is_partial,
                     timestamp: data.timestamp,
                 };
diff --git a/templates/viewer.html b/templates/viewer.html
index 695d8f8..c49be15 100644
--- a/templates/viewer.html
+++ b/templates/viewer.html
@@ -869,7 +869,15 @@ <h1>Select Your Language</h1>
                     group.segs.forEach(seg => {
                         const li = document.createElement('li');
                         li.style.cssText = 'display:flex; gap:8px; align-items:baseline; margin-bottom:2px;';
-                        const absMs = utterance.timestamp + (seg.start || 0) * 1000;
+                        // Prefer batch_end_ts_ms anchor from the backend. Seg.start is
+                        // seconds from batch start, so absolute spoken-at time is
+                        // batch_end - (audio_duration - seg.start) * 1000.
+                        let absMs;
+                        if (utterance.batch_end_ts_ms && utterance.audio_duration_secs != null) {
+                            absMs = utterance.batch_end_ts_ms - (utterance.audio_duration_secs - (seg.start || 0)) * 1000;
+                        } else {
+                            absMs = utterance.timestamp + (seg.start || 0) * 1000;
+                        }
                         const d = new Date(absMs);
                         const ss = String(d.getSeconds()).padStart(2, '0');
                         const tsSpan = document.createElement('span');
@@ -942,13 +950,20 @@ <h1>Select Your Language</h1>
 
             const uttId = data.utterance_id || `anon_${_anonUtteranceCounter++}`;
             const segments = data.translated_segments[selectedLanguage].map(s => ({
-                speaker: s.speaker || null, text: s.text,
+                speaker: s.speaker || null,
+                text: s.text,
+                start: s.start,
+                end: s.end,
             }));
+            // True wall-clock anchor for the batch. Each segment's spoken-at
+            // time = batch_end_ts_ms - (audio_duration_secs - seg.start) * 1000.
             const entry = {
                 utterance_id: uttId,
                 segments: segments,
                 is_partial: !!data.is_partial,
                 timestamp: Date.now(),
+                batch_end_ts_ms: data.batch_end_ts_ms || null,
+                audio_duration_secs: data.audio_duration_secs || null,
             };
 
             const existingIdx = displayedTranslations.findIndex(u => u.utterance_id === uttId);