-
Notifications
You must be signed in to change notification settings - Fork 232
✨(summary) extended support for all video / audio files #1358
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |
| import subprocess | ||
| import tempfile | ||
| from contextlib import contextmanager | ||
| from dataclasses import dataclass | ||
| from datetime import timedelta | ||
| from pathlib import Path | ||
| from urllib.parse import urlparse | ||
|
|
@@ -23,6 +24,24 @@ | |
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class TranscribeError(ValueError): | ||
| """Base class for transcribe-related errors.""" | ||
|
|
||
| error_code: str = "unknown_error" | ||
|
|
||
|
|
||
| class MediaDurationTooLongError(TranscribeError): | ||
| """Raised when the duration of a media file exceeds the allowed limit.""" | ||
|
|
||
| error_code = "media_duration_too_long" | ||
|
|
||
|
|
||
| class NoAudioInFileError(TranscribeError): | ||
| """Raised when a media file does not contain any audio.""" | ||
|
|
||
| error_code = "no_audio_in_file" | ||
|
|
||
|
|
||
| def _get_duration_from_packets(local_path: Path) -> float: | ||
| """Estimate duration from audio packet timestamps.""" | ||
| # Run ffprobe to inspect the first audio stream in the file. | ||
|
|
@@ -84,7 +103,7 @@ | |
| return max(packet_ends) | ||
|
|
||
|
|
||
| def get_media_duration(local_path: Path): | ||
| def get_media_duration_seconds(local_path: Path) -> float: | ||
| """Get media (audio or video) file duration in seconds.""" | ||
| # ruff: noqa: S607 Hard to know the ffprobe path, it depends on the deployment | ||
| result = subprocess.run( | ||
|
|
@@ -114,6 +133,123 @@ | |
| return _get_duration_from_packets(local_path) | ||
|
|
||
|
|
||
| @dataclass(frozen=True) | ||
| class MediaInfo: | ||
| """Object containing information about the media file.""" | ||
|
|
||
| path: Path | ||
| has_audio: bool | ||
| has_video: bool | ||
| audio_duration_seconds: float | None | ||
|
lebaudantoine marked this conversation as resolved.
|
||
| audio_codec_name: str | None | ||
|
|
||
|
|
||
| def get_media_info(local_path: Path) -> MediaInfo: | ||
| """Determines if a media file contains an audio and / or a video stream. | ||
|
|
||
| This function checks if the given media file contains at least one | ||
| audio / video stream. It utilizes ffmpeg to analyze the media and parse | ||
| its metadata to detect the presence of audio / video content. | ||
| If appropriate, it also retrieves the codec name for the audio stream | ||
| and its duration. | ||
| """ | ||
| # ruff: noqa: S607 Hard to know the ffprobe path, it depends on the deployment | ||
| # ruff: noqa: S603 Input can be trusted | ||
| result = subprocess.run( | ||
| [ | ||
| "ffprobe", | ||
| "-v", | ||
| "error", | ||
| "-show_entries", | ||
| "stream=codec_type,codec_name", | ||
| "-print_format", | ||
| "json", | ||
| str(local_path), | ||
| ], | ||
| check=False, | ||
| stdout=subprocess.PIPE, | ||
| stderr=subprocess.PIPE, | ||
| text=True, | ||
| ) | ||
| data = json.loads(result.stdout) | ||
|
|
||
| streams = data.get("streams", []) | ||
| has_audio = any(el["codec_type"] == "audio" for el in streams) | ||
| has_video = any(el["codec_type"] == "video" for el in streams) | ||
| audio_codec_name = next( | ||
| ( | ||
| stream.get("codec_name") | ||
| for stream in streams | ||
| if stream.get("codec_type") == "audio" | ||
| ), | ||
| None, | ||
| ) | ||
| audio_duration_seconds = None | ||
| if has_audio: | ||
| audio_duration_seconds = get_media_duration_seconds(local_path) | ||
| return MediaInfo( | ||
| path=local_path, | ||
| has_audio=has_audio, | ||
| has_video=has_video, | ||
| audio_duration_seconds=audio_duration_seconds, | ||
| audio_codec_name=audio_codec_name, | ||
| ) | ||
|
|
||
|
|
||
| def extract_audio_from_video(media_info: MediaInfo) -> Path: | ||
| """Extracts the audio track from a video file and saves it as a separate audio file. | ||
|
|
||
| Based on the provided audio codec, | ||
| it determines an appropriate audio file extension. | ||
| It then runs | ||
| `ffmpeg` to extract the audio track without re-encoding it, ensuring quality is | ||
| preserved and speed. | ||
|
|
||
| Raises: | ||
| CalledProcessError: Raised if the `ffmpeg` command execution fails. | ||
| """ | ||
| if media_info.audio_codec_name is None: | ||
| raise ValueError("Media file must have codec name must be provided") | ||
|
|
||
| # .mka can contain a bunch of audio codecs, using it as a default | ||
| # to avoid potential issues with the ffmpeg copy later on | ||
| suffix = settings.codec_to_extension.get(media_info.audio_codec_name, ".mka") | ||
|
|
||
| with tempfile.NamedTemporaryFile( | ||
| suffix=suffix, | ||
| delete=False, | ||
| prefix="audio_extract_", | ||
| ) as tmp: | ||
| output_path = Path(tmp.name) | ||
|
|
||
| extract_command = [ | ||
| "ffmpeg", | ||
| "-i", | ||
| str(media_info.path), | ||
| "-vn", | ||
| "-acodec", | ||
| "copy", | ||
| "-y", | ||
| str(output_path), | ||
| ] | ||
|
|
||
| try: | ||
| subprocess.run(extract_command, check=True) | ||
| except BaseException as e: | ||
|
Check failure on line 238 in src/summary/summary/core/file_service.py
|
||
| if isinstance(e, FileNotFoundError): | ||
| logger.error("ffmpeg not found. Please install ffmpeg.") | ||
| elif isinstance(e, subprocess.CalledProcessError): | ||
| logger.error("Audio extraction failed: %s", e.stderr.decode()) | ||
| else: | ||
| logger.error("Unexpected error during audio extraction: %s", e) | ||
|
Check failure on line 244 in src/summary/summary/core/file_service.py
|
||
|
|
||
| if output_path.exists(): | ||
| os.remove(output_path) | ||
| raise RuntimeError("Failed to extract audio from file") from e | ||
|
Comment on lines
+238
to
+248
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in terms of python style I found the initial syntax with several except easier to scan/parse.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is to be reworked as a bigger clean of those file handles |
||
|
|
||
| return output_path | ||
|
|
||
|
|
||
| class FileServiceException(Exception): | ||
| """Base exception for file service operations.""" | ||
|
|
||
|
|
@@ -141,8 +277,7 @@ | |
| self._bucket_name = settings.aws_storage_bucket_name | ||
| self._stream_chunk_size = 32 * 1024 | ||
|
|
||
| self._allowed_extensions = settings.recording_allowed_extensions | ||
| self._max_duration = settings.recording_max_duration | ||
| self._max_duration_seconds = settings.recording_max_duration | ||
|
|
||
| def _download_from_minio(self, remote_object_key) -> Path: | ||
| """Download file from MinIO to local temporary file. | ||
|
|
@@ -158,10 +293,6 @@ | |
|
|
||
| extension = Path(remote_object_key).suffix.lower() | ||
|
|
||
| if extension not in self._allowed_extensions: | ||
| logger.warning("Invalid file extension '%s'", extension) | ||
| raise ValueError(f"Invalid file extension '{extension}'") | ||
|
|
||
| response = None | ||
|
|
||
| try: | ||
|
|
@@ -204,11 +335,6 @@ | |
| raise ValueError("Invalid cloud_storage_url") | ||
|
|
||
| extension = Path(urlparse(cloud_storage_url).path).suffix.lower() | ||
| if extension not in self._allowed_extensions: | ||
| logger.warning( | ||
| "Invalid file extension '%s' from cloud_storage_url", extension | ||
| ) | ||
| raise ValueError(f"Invalid file extension '{extension}'") | ||
|
|
||
| try: | ||
| with requests.get( | ||
|
|
@@ -244,63 +370,22 @@ | |
| "Unexpected error while downloading object from cloud_storage_url." | ||
| ) from e | ||
|
|
||
| def _validate_duration(self, local_path: Path) -> float: | ||
| def _validate_duration(self, duration_seconds: float) -> None: | ||
|
FloChehab marked this conversation as resolved.
|
||
| """Validate audio file duration against configured maximum.""" | ||
| duration = get_media_duration(local_path) | ||
|
|
||
| logger.info( | ||
| "Recording file duration: %.2f seconds", | ||
| duration, | ||
| duration_seconds, | ||
| ) | ||
|
|
||
| if self._max_duration is not None and duration > self._max_duration: | ||
| if ( | ||
| self._max_duration_seconds is not None | ||
| and duration_seconds > self._max_duration_seconds | ||
| ): | ||
| error_msg = "Recording too long. Limit is %.2fs seconds" % ( | ||
| self._max_duration, | ||
| self._max_duration_seconds, | ||
| ) | ||
| logger.error(error_msg) | ||
| raise ValueError(error_msg) | ||
|
|
||
| return duration | ||
|
|
||
| def _extract_audio_from_video(self, video_path: Path) -> Path: | ||
| """Extract audio from video file (e.g., MP4) and save as audio file.""" | ||
| logger.info("Extracting audio from video file: %s", video_path) | ||
|
|
||
| with tempfile.NamedTemporaryFile( | ||
| suffix=".m4a", delete=False, prefix="audio_extract_" | ||
| ) as tmp: | ||
| output_path = Path(tmp.name) | ||
|
|
||
| try: | ||
| command = [ | ||
| "ffmpeg", | ||
| "-i", | ||
| str(video_path), | ||
| "-vn", # No video | ||
| "-acodec", | ||
| "copy", | ||
| "-y", # Overwrite output file if exists | ||
| str(output_path), | ||
| ] | ||
|
|
||
| # ruff: noqa: S603 | ||
| subprocess.run( | ||
| command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True | ||
| ) | ||
|
|
||
| logger.info("Audio successfully extracted to: %s", output_path) | ||
| return output_path | ||
|
|
||
| except FileNotFoundError as e: | ||
| logger.error("ffmpeg not found. Please install ffmpeg.") | ||
| if output_path.exists(): | ||
| os.remove(output_path) | ||
| raise RuntimeError("ffmpeg is not installed or not in PATH") from e | ||
| except subprocess.CalledProcessError as e: | ||
| logger.error("Audio extraction failed: %s", e.stderr.decode()) | ||
| if output_path.exists(): | ||
| os.remove(output_path) | ||
| raise RuntimeError("Failed to extract audio.") from e | ||
| raise MediaDurationTooLongError(error_msg) | ||
|
|
||
| def read_json(self, object_name: str) -> dict: | ||
| """Read and parse a JSON file from MinIO storage.""" | ||
|
|
@@ -336,8 +421,8 @@ | |
| and yields an open file handle with metadata. Automatically cleans up | ||
| temporary files when the context exits. | ||
| """ | ||
| downloaded_path = None | ||
| processed_path = None | ||
| downloaded_path: Path | None = None | ||
| processed_path: Path | None = None | ||
| file_handle = None | ||
|
|
||
| try: | ||
|
|
@@ -356,18 +441,19 @@ | |
| else: | ||
| downloaded_path = self._download_from_minio(remote_object_key) | ||
|
|
||
| duration = self._validate_duration(downloaded_path) | ||
| media_info = get_media_info(downloaded_path) | ||
|
|
||
| extension = downloaded_path.suffix.lower() | ||
| if not media_info.has_audio: | ||
| raise NoAudioInFileError("Media file does not contain audio") | ||
| self._validate_duration(media_info.audio_duration_seconds) | ||
|
|
||
| if extension in settings.recording_video_extensions: | ||
| if media_info.has_video: | ||
| logger.info("Video file detected, extracting audio...") | ||
| extracted_audio_path = self._extract_audio_from_video(downloaded_path) | ||
| processed_path = extracted_audio_path | ||
| processed_path = extract_audio_from_video(media_info) | ||
| else: | ||
| processed_path = downloaded_path | ||
|
|
||
| metadata = {"duration": duration, "extension": extension} | ||
| metadata = {"duration": media_info.audio_duration_seconds} | ||
|
|
||
| file_handle = open(processed_path, "rb") | ||
| yield file_handle, metadata | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not related with this PR, can't
transcribe_audioreturns directly aWhisperXResponseinstance?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is indeed work to be done around using from pydantic models, especially in the helpers by @cameledev. I left that for another PR.