From 919b7f3c471b7018e57b80f21a1f7d04d269b7b1 Mon Sep 17 00:00:00 2001
From: Florent Chehab <flo@nextmile.fr>
Date: Thu, 21 May 2026 13:05:22 +0200
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8(summary)=20extended=20support=20for?=
 =?UTF-8?q?=20all=20video/audio=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Removed constraint on file extension
* Infer audio/video streams from the media with ffmpeg
* Infer the correct processed audio file extension based on actual
  codec to avoid ffmpeg errors

We need to support more extensions and make audio extraction dynamic,
as we shipped transcript in production and it led to user complaints
requesting more formats.
---
 CHANGELOG.md                                |   3 +-
 src/summary/summary/core/celery_worker.py   |  32 ++-
 src/summary/summary/core/config.py          |  20 +-
 src/summary/summary/core/file_service.py    | 224 ++++++++++++++------
 src/summary/summary/core/shared_models.py   |  12 +-
 src/summary/tests/unit/test_file_service.py | 126 +++++++++--
 6 files changed, 308 insertions(+), 109 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fdb8ea1088..d5119c63e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,8 @@ and this project adheres to
 - ✨(frontend) make reaction toolbar responsive on small viewports
 - ✨(frontend) enable reactions on mobile devices
 - ✨(frontend) introduce picture-in-picture meeting
-- ✨(backend) add core.recording.event.parsers.S3Parser 
+- ✨(backend) add core.recording.event.parsers.S3Parser
+- ✨(summary) extended support for all video / audio files #1358
 
 ### Changed 
 
diff --git a/src/summary/summary/core/celery_worker.py b/src/summary/summary/core/celery_worker.py
index 19c56f35b0..567c67e9a2 100644
--- a/src/summary/summary/core/celery_worker.py
+++ b/src/summary/summary/core/celery_worker.py
@@ -14,7 +14,11 @@
 
 from summary.core.analytics import MetadataManager, get_analytics
 from summary.core.config import get_settings
-from summary.core.file_service import FileService, FileServiceException
+from summary.core.file_service import (
+    FileService,
+    FileServiceException,
+    TranscribeError,
+)
 from summary.core.llm_service import LLMException, LLMObservability, LLMService
 from summary.core.locales import get_locale
 from summary.core.models import (
@@ -540,14 +544,24 @@ def process_audio_transcribe_v2_task(
 
     job_id = self.request.id
 
-    transcription_res = WhisperXResponse(
-        **transcribe_audio(  # type: ignore
-            task_id=job_id,
-            cloud_storage_url=payload.cloud_storage_url,
-            language=payload.language,
-            raises=True,
-        ).model_dump()
-    )
+    try:
+        transcription_res = WhisperXResponse(
+            **transcribe_audio(  # type: ignore
+                task_id=job_id,
+                cloud_storage_url=payload.cloud_storage_url,
+                language=payload.language,
+                raises=True,
+            ).model_dump()
+        )
+    except TranscribeError as e:
+        failure_payload = TranscribeWebhookFailurePayload(
+            job_id=job_id,
+            error_code=e.error_code,
+        )
+        call_webhook_v2_task.apply_async(
+            args=[failure_payload.model_dump(), payload.tenant_id]
+        )
+        return failure_payload.model_dump()
 
     file_service.store_transcript(
         transcript=transcription_res,
diff --git a/src/summary/summary/core/config.py b/src/summary/summary/core/config.py
index 6858a49d0d..1e00100340 100644
--- a/src/summary/summary/core/config.py
+++ b/src/summary/summary/core/config.py
@@ -56,16 +56,16 @@ class Settings(BaseSettings):
 
     # Audio recordings
     recording_max_duration: Optional[int] = None
-    recording_allowed_extensions: Set[str] = {
-        ".ogg",
-        ".mp4",
-        ".m4a",
-        ".webm",
-        ".ogv",
-        ".opus",
-        ".wav",
-    }
-    recording_video_extensions: Set[str] = {".mp4"}
+    codec_to_extension: dict[str, str] = Field(
+        default_factory=lambda: {
+            "aac": ".m4a",
+            "alac": ".m4a",
+            "mp3": ".mp3",
+            "opus": ".opus",
+            "vorbis": ".ogg",
+            "flac": ".flac",
+        }
+    )
 
     # Celery settings
     celery_broker_url: str = "redis://redis/0"
diff --git a/src/summary/summary/core/file_service.py b/src/summary/summary/core/file_service.py
index 994b3d1ddf..ea91d12c3f 100644
--- a/src/summary/summary/core/file_service.py
+++ b/src/summary/summary/core/file_service.py
@@ -7,6 +7,7 @@
 import subprocess
 import tempfile
 from contextlib import contextmanager
+from dataclasses import dataclass
 from datetime import timedelta
 from pathlib import Path
 from urllib.parse import urlparse
@@ -23,6 +24,24 @@
 logger = logging.getLogger(__name__)
 
 
+class TranscribeError(ValueError):
+    """Base class for transcribe-related errors."""
+
+    error_code: str = "unknown_error"
+
+
+class MediaDurationTooLongError(TranscribeError):
+    """Raised when the duration of a media file exceeds the allowed limit."""
+
+    error_code = "media_duration_too_long"
+
+
+class NoAudioInFileError(TranscribeError):
+    """Raised when a media file does not contain any audio."""
+
+    error_code = "no_audio_in_file"
+
+
 def _get_duration_from_packets(local_path: Path) -> float:
     """Estimate duration from audio packet timestamps."""
     # Run ffprobe to inspect the first audio stream in the file.
@@ -84,7 +103,7 @@ def _get_duration_from_packets(local_path: Path) -> float:
     return max(packet_ends)
 
 
-def get_media_duration(local_path: Path):
+def get_media_duration_seconds(local_path: Path) -> float:
     """Get media (audio or video) file duration in seconds."""
     # ruff: noqa: S607 Hard to know the ffprobe path, it depends on the deployment
     result = subprocess.run(
@@ -114,6 +133,123 @@ def get_media_duration(local_path: Path):
     return _get_duration_from_packets(local_path)
 
 
+@dataclass(frozen=True)
+class MediaInfo:
+    """Object containing information about the media file."""
+
+    path: Path
+    has_audio: bool
+    has_video: bool
+    audio_duration_seconds: float | None
+    audio_codec_name: str | None
+
+
+def get_media_info(local_path: Path) -> MediaInfo:
+    """Determines if a media file contains an audio and / or a video stream.
+
+    This function checks if the given media file contains at least one
+    audio / video stream. It utilizes ffmpeg to analyze the media and parse
+    its metadata to detect the presence of audio / video content.
+    If appropriate, it also retrieves the codec name for the audio stream
+    and its duration.
+    """
+    # ruff: noqa: S607 Hard to know the ffprobe path, it depends on the deployment
+    # ruff: noqa: S603 Input can be trusted
+    result = subprocess.run(
+        [
+            "ffprobe",
+            "-v",
+            "error",
+            "-show_entries",
+            "stream=codec_type,codec_name",
+            "-print_format",
+            "json",
+            str(local_path),
+        ],
+        check=False,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    data = json.loads(result.stdout)
+
+    streams = data.get("streams", [])
+    has_audio = any(el["codec_type"] == "audio" for el in streams)
+    has_video = any(el["codec_type"] == "video" for el in streams)
+    audio_codec_name = next(
+        (
+            stream.get("codec_name")
+            for stream in streams
+            if stream.get("codec_type") == "audio"
+        ),
+        None,
+    )
+    audio_duration_seconds = None
+    if has_audio:
+        audio_duration_seconds = get_media_duration_seconds(local_path)
+    return MediaInfo(
+        path=local_path,
+        has_audio=has_audio,
+        has_video=has_video,
+        audio_duration_seconds=audio_duration_seconds,
+        audio_codec_name=audio_codec_name,
+    )
+
+
+def extract_audio_from_video(media_info: MediaInfo) -> Path:
+    """Extracts the audio track from a video file and saves it as a separate audio file.
+
+    Based on the provided audio codec,
+    it determines an appropriate audio file extension.
+    It then runs
+    `ffmpeg` to extract the audio track without re-encoding it, ensuring quality is
+    preserved and speed.
+
+    Raises:
+        CalledProcessError: Raised if the `ffmpeg` command execution fails.
+    """
+    if media_info.audio_codec_name is None:
+        raise ValueError("Media file must have codec name must be provided")
+
+    # .mka can contain a bunch of audio codecs, using it as a default
+    # to avoid potential issues with the ffmpeg copy later on
+    suffix = settings.codec_to_extension.get(media_info.audio_codec_name, ".mka")
+
+    with tempfile.NamedTemporaryFile(
+        suffix=suffix,
+        delete=False,
+        prefix="audio_extract_",
+    ) as tmp:
+        output_path = Path(tmp.name)
+
+    extract_command = [
+        "ffmpeg",
+        "-i",
+        str(media_info.path),
+        "-vn",
+        "-acodec",
+        "copy",
+        "-y",
+        str(output_path),
+    ]
+
+    try:
+        subprocess.run(extract_command, check=True)
+    except BaseException as e:
+        if isinstance(e, FileNotFoundError):
+            logger.error("ffmpeg not found. Please install ffmpeg.")
+        elif isinstance(e, subprocess.CalledProcessError):
+            logger.error("Audio extraction failed: %s", e.stderr.decode())
+        else:
+            logger.error("Unexpected error during audio extraction: %s", e)
+
+        if output_path.exists():
+            os.remove(output_path)
+        raise RuntimeError("Failed to extract audio from file") from e
+
+    return output_path
+
+
 class FileServiceException(Exception):
     """Base exception for file service operations."""
 
@@ -141,8 +277,7 @@ def __init__(self):
         self._bucket_name = settings.aws_storage_bucket_name
         self._stream_chunk_size = 32 * 1024
 
-        self._allowed_extensions = settings.recording_allowed_extensions
-        self._max_duration = settings.recording_max_duration
+        self._max_duration_seconds = settings.recording_max_duration
 
     def _download_from_minio(self, remote_object_key) -> Path:
         """Download file from MinIO to local temporary file.
@@ -158,10 +293,6 @@ def _download_from_minio(self, remote_object_key) -> Path:
 
         extension = Path(remote_object_key).suffix.lower()
 
-        if extension not in self._allowed_extensions:
-            logger.warning("Invalid file extension '%s'", extension)
-            raise ValueError(f"Invalid file extension '{extension}'")
-
         response = None
 
         try:
@@ -204,11 +335,6 @@ def _download_from_cloud_storage_url(self, cloud_storage_url: str) -> Path:
             raise ValueError("Invalid cloud_storage_url")
 
         extension = Path(urlparse(cloud_storage_url).path).suffix.lower()
-        if extension not in self._allowed_extensions:
-            logger.warning(
-                "Invalid file extension '%s' from cloud_storage_url", extension
-            )
-            raise ValueError(f"Invalid file extension '{extension}'")
 
         try:
             with requests.get(
@@ -244,63 +370,22 @@ def _download_from_cloud_storage_url(self, cloud_storage_url: str) -> Path:
                 "Unexpected error while downloading object from cloud_storage_url."
             ) from e
 
-    def _validate_duration(self, local_path: Path) -> float:
+    def _validate_duration(self, duration_seconds: float) -> None:
         """Validate audio file duration against configured maximum."""
-        duration = get_media_duration(local_path)
-
         logger.info(
             "Recording file duration: %.2f seconds",
-            duration,
+            duration_seconds,
         )
 
-        if self._max_duration is not None and duration > self._max_duration:
+        if (
+            self._max_duration_seconds is not None
+            and duration_seconds > self._max_duration_seconds
+        ):
             error_msg = "Recording too long. Limit is %.2fs seconds" % (
-                self._max_duration,
+                self._max_duration_seconds,
             )
             logger.error(error_msg)
-            raise ValueError(error_msg)
-
-        return duration
-
-    def _extract_audio_from_video(self, video_path: Path) -> Path:
-        """Extract audio from video file (e.g., MP4) and save as audio file."""
-        logger.info("Extracting audio from video file: %s", video_path)
-
-        with tempfile.NamedTemporaryFile(
-            suffix=".m4a", delete=False, prefix="audio_extract_"
-        ) as tmp:
-            output_path = Path(tmp.name)
-
-        try:
-            command = [
-                "ffmpeg",
-                "-i",
-                str(video_path),
-                "-vn",  # No video
-                "-acodec",
-                "copy",
-                "-y",  # Overwrite output file if exists
-                str(output_path),
-            ]
-
-            # ruff: noqa: S603
-            subprocess.run(
-                command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
-            )
-
-            logger.info("Audio successfully extracted to: %s", output_path)
-            return output_path
-
-        except FileNotFoundError as e:
-            logger.error("ffmpeg not found. Please install ffmpeg.")
-            if output_path.exists():
-                os.remove(output_path)
-            raise RuntimeError("ffmpeg is not installed or not in PATH") from e
-        except subprocess.CalledProcessError as e:
-            logger.error("Audio extraction failed: %s", e.stderr.decode())
-            if output_path.exists():
-                os.remove(output_path)
-            raise RuntimeError("Failed to extract audio.") from e
+            raise MediaDurationTooLongError(error_msg)
 
     def read_json(self, object_name: str) -> dict:
         """Read and parse a JSON file from MinIO storage."""
@@ -336,8 +421,8 @@ def prepare_audio_file(
         and yields an open file handle with metadata. Automatically cleans up
         temporary files when the context exits.
         """
-        downloaded_path = None
-        processed_path = None
+        downloaded_path: Path | None = None
+        processed_path: Path | None = None
         file_handle = None
 
         try:
@@ -356,18 +441,19 @@ def prepare_audio_file(
             else:
                 downloaded_path = self._download_from_minio(remote_object_key)
 
-            duration = self._validate_duration(downloaded_path)
+            media_info = get_media_info(downloaded_path)
 
-            extension = downloaded_path.suffix.lower()
+            if not media_info.has_audio:
+                raise NoAudioInFileError("Media file does not contain audio")
+            self._validate_duration(media_info.audio_duration_seconds)
 
-            if extension in settings.recording_video_extensions:
+            if media_info.has_video:
                 logger.info("Video file detected, extracting audio...")
-                extracted_audio_path = self._extract_audio_from_video(downloaded_path)
-                processed_path = extracted_audio_path
+                processed_path = extract_audio_from_video(media_info)
             else:
                 processed_path = downloaded_path
 
-            metadata = {"duration": duration, "extension": extension}
+            metadata = {"duration": media_info.audio_duration_seconds}
 
             file_handle = open(processed_path, "rb")
             yield file_handle, metadata
diff --git a/src/summary/summary/core/shared_models.py b/src/summary/summary/core/shared_models.py
index 043ea75910..6ce6f584a6 100644
--- a/src/summary/summary/core/shared_models.py
+++ b/src/summary/summary/core/shared_models.py
@@ -89,9 +89,11 @@ class TranscribeWebhookFailurePayload(BaseWebhook):
 
     type: Literal["transcript"] = Field(default="transcript")
     status: Literal["failure"] = Field(default="failure")
-    error_code: Literal["unknown_error"] = Field(
-        title="Error code", description="The error code."
-    )
+    # we authorized any other string than the one in the literal
+    # to avoid causing a breaking change on the client side
+    error_code: (
+        Literal["unknown_error", "no_audio_in_file", "media_duration_too_long"] | str
+    ) = Field(title="Error code", description="The error code.")
 
 
 TranscribeWebhookPayloads = Annotated[
@@ -126,7 +128,9 @@ class SummarizeWebhookFailurePayload(BaseWebhook):
 
     type: Literal["summary"] = Field(default="summary")
     status: Literal["failure"] = Field(default="failure")
-    error_code: Literal["unknown_error"] = Field(
+    # we authorized any other string than unkown_error
+    # to avoid causing a breaking change on the client side
+    error_code: Literal["unknown_error"] | str = Field(
         title="Error code", description="The error code."
     )
 
diff --git a/src/summary/tests/unit/test_file_service.py b/src/summary/tests/unit/test_file_service.py
index 30a70d8759..8f9a92dd27 100644
--- a/src/summary/tests/unit/test_file_service.py
+++ b/src/summary/tests/unit/test_file_service.py
@@ -4,27 +4,121 @@
 
 import pytest
 
-from summary.core.file_service import get_media_duration
+from summary.core.file_service import (
+    MediaInfo,
+    extract_audio_from_video,
+    get_media_info,
+)
+
+BASE_PATH = Path(__file__).parent.parent / "assets"
+
+MEDIA_INFO_SAMPLE_VISIO = MediaInfo(
+    path=BASE_PATH / "video-sample-visio.mp4",
+    has_audio=True,
+    has_video=True,
+    audio_duration_seconds=5.34059,
+    audio_codec_name="aac",
+)
 
 
 @pytest.mark.parametrize(
-    "filename, duration",
+    "media_info",
     [
-        ("audio-sample-android-chrome.webm", 2.2795),
-        ("audio-sample-android-firefox.ogg", 2.3025),
-        ("audio-sample-android.m4a", 1.38),
-        ("audio-sample-chromium.webm", 2.65),
-        ("audio-sample-firefox.ogg", 2.0865),
-        ("audio-sample-ios-browser.webm", 2.6229),
-        ("audio-sample-ios.m4a", 1.408),
-        ("audio-sample-mac-os-safari.webm", 2.3049),
-        ("video-sample-visio.mp4", 5.34059),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-android-chrome.webm",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=2.2795,
+            audio_codec_name="opus",
+        ),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-android-firefox.ogg",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=2.3025,
+            audio_codec_name="opus",
+        ),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-android.m4a",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=1.38,
+            audio_codec_name="aac",
+        ),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-chromium.webm",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=2.65,
+            audio_codec_name="opus",
+        ),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-firefox.ogg",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=2.0865,
+            audio_codec_name="opus",
+        ),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-ios-browser.webm",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=2.6229,
+            audio_codec_name="opus",
+        ),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-ios.m4a",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=1.408,
+            audio_codec_name="aac",
+        ),
+        MediaInfo(
+            path=BASE_PATH / "audio-sample-mac-os-safari.webm",
+            has_audio=True,
+            has_video=False,
+            audio_duration_seconds=2.3049,
+            audio_codec_name="opus",
+        ),
+        MEDIA_INFO_SAMPLE_VISIO,
     ],
 )
-def test_validate_duration_supports_all_used_file_formats(
-    filename: str, duration: float
+def test_validate_media_info_supports_all_used_file_formats(
+    media_info: MediaInfo,
 ) -> None:
-    """Validate duration for Safari iPhone WebM files without format duration."""
-    audio_path = Path(__file__).parent.parent / "assets" / filename
+    """Validate media_info for different files that the service can handle."""
+    res = get_media_info(media_info.path)
+    assert res.has_audio == media_info.has_audio
+    assert res.has_video == media_info.has_video
+    assert res.audio_codec_name == media_info.audio_codec_name
+    if res.has_audio:
+        assert res.audio_duration_seconds == pytest.approx(
+            media_info.audio_duration_seconds, 1e-3
+        )
+    else:
+        assert res.audio_duration_seconds is None
+
+
+def test_media_info_invalid_file() -> None:
+    """Test that media_info returns correct values for invalid files."""
+    assert get_media_info(Path(__file__)) == MediaInfo(
+        path=Path(__file__),
+        has_audio=False,
+        has_video=False,
+        audio_duration_seconds=None,
+        audio_codec_name=None,
+    )
+
 
-    assert get_media_duration(audio_path) == pytest.approx(duration, 1e-3)
+def test_extract_audio_from_video():
+    """Test that extract_audio_from_video can extract audio from a video file."""
+    path = None
+    # A bit of cleanup logic since this is not a generator
+    try:
+        path = extract_audio_from_video(MEDIA_INFO_SAMPLE_VISIO)
+        assert path.name.endswith(".m4a")
+    except Exception as e:
+        pytest.fail(f"Failed to extract audio from video: {e}")
+    finally:
+        if path and path.exists():
+            path.unlink()