diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fba824f..10a3b69 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v6 diff --git a/pyproject.toml b/pyproject.toml index 749769e..e8c3c64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,9 +7,9 @@ authors = [ ] license = {text = "MIT"} readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [ - "av", + "av>=14.1", # VideoFrame.rotation was added in 14.1 "numpy", ] diff --git a/simple_video_utils/frames.py b/simple_video_utils/frames.py index 64763e1..ce12142 100644 --- a/simple_video_utils/frames.py +++ b/simple_video_utils/frames.py @@ -1,4 +1,5 @@ -from typing import BinaryIO, Generator, Optional, Tuple +from collections.abc import Generator +from typing import BinaryIO import av import numpy as np @@ -6,10 +7,29 @@ from simple_video_utils.metadata import VideoMetadata, _open_container, video_metadata_from_container +def _frame_to_rgb(frame: av.VideoFrame) -> np.ndarray: + """ + Convert a frame to an RGB array in display orientation. + + PyAV decodes frames in their stored orientation and does not apply the + container's display-matrix rotation (unlike the ffmpeg CLI, which + autorotates). Phone-recorded videos commonly store landscape frames with + a 90° rotation tag, so we apply it here. + """ + array = frame.to_ndarray(format='rgb24') + rotation = frame.rotation % 360 + if rotation and rotation % 90 == 0: + # rotation=90 with k=1 (counterclockwise) matches ffmpeg autorotate pixel-exactly. + # np.rot90 returns a non-contiguous view, which consumers like MediaPipe + # and OpenCV reject — copy to a contiguous array. + array = np.ascontiguousarray(np.rot90(array, k=rotation // 90)) + return array + + def _generate_frames( container: av.container.InputContainer, skip_frames: int = 0, - max_frames: Optional[int] = None, + max_frames: int | None = None, ) -> Generator[np.ndarray, None, None]: """ Generate RGB frames from a container's current position. @@ -23,7 +43,7 @@ def _generate_frames( max_frames: Maximum number of frames to yield, or None for all remaining. Yields: - RGB numpy arrays (H, W, 3) for frames after skipping. + RGB numpy arrays (H, W, 3) in display orientation for frames after skipping. """ frames_decoded = 0 frames_yielded = 0 @@ -33,7 +53,7 @@ def _generate_frames( frames_decoded += 1 continue - yield frame.to_ndarray(format='rgb24') + yield _frame_to_rgb(frame) frames_yielded += 1 if max_frames is not None and frames_yielded >= max_frames: @@ -41,11 +61,11 @@ def _generate_frames( frames_decoded += 1 def _validate_parameters( - start_frame: Optional[int], - end_frame: Optional[int], - start_time: Optional[float], - end_time: Optional[float], -) -> Tuple[bool, bool]: + start_frame: int | None, + end_frame: int | None, + start_time: float | None, + end_time: float | None, +) -> tuple[bool, bool]: """Validate that time and frame parameters aren't mixed.""" has_frame_params = start_frame is not None or end_frame is not None has_time_params = start_time is not None or end_time is not None @@ -58,10 +78,10 @@ def _validate_parameters( def _convert_time_to_frames( - start_time: Optional[float], - end_time: Optional[float], + start_time: float | None, + end_time: float | None, fps: float, -) -> Tuple[int, Optional[int]]: +) -> tuple[int, int | None]: """Convert time-based parameters to frame indices.""" start = int((start_time or 0.0) * fps) end = int(end_time * fps) if end_time is not None else None @@ -74,9 +94,9 @@ def _convert_time_to_frames( def _normalize_frame_range( - start_frame: Optional[int], - end_frame: Optional[int], -) -> Tuple[int, Optional[int]]: + start_frame: int | None, + end_frame: int | None, +) -> tuple[int, int | None]: """Normalize frame parameters with defaults and validation.""" start = start_frame if start_frame is not None else 0 @@ -122,10 +142,10 @@ def _calculate_seek_position( def read_frames_exact( src: str, - start_frame: Optional[int] = None, - end_frame: Optional[int] = None, - start_time: Optional[float] = None, - end_time: Optional[float] = None, + start_frame: int | None = None, + end_frame: int | None = None, + start_time: float | None = None, + end_time: float | None = None, thread_type: str = "AUTO", ) -> Generator[np.ndarray, None, None]: """ @@ -195,7 +215,7 @@ def read_frames_from_stream( skip_frames: int = 0, thread_type: str = "AUTO", buffer_size: int = 32768, # PyAV default buffer size, can be reduced for lower latency when realtime streaming -) -> Tuple[VideoMetadata, Generator[np.ndarray, None, None]]: +) -> tuple[VideoMetadata, Generator[np.ndarray, None, None]]: """ Read frames from a video stream (file-like object). @@ -217,13 +237,29 @@ def read_frames_from_stream( seeking (MP4 with moov at end), the stream must be fully available. """ container = av.open(stream, mode='r', buffer_size=buffer_size) - for s in container.streams.video: - s.thread_type = thread_type - meta = video_metadata_from_container(container) + try: + for s in container.streams.video: + s.thread_type = thread_type + + # The display-matrix rotation is only exposed per-frame, and the stream may + # not be seekable (e.g. a pipe) — so decode the first frame eagerly for the + # metadata and hand it back through the generator. + first_frame = next(container.decode(video=0), None) + rotation = first_frame.rotation if first_frame is not None else 0 + meta = video_metadata_from_container(container, rotation=rotation) + except Exception: + container.close() + raise def frame_generator() -> Generator[np.ndarray, None, None]: try: - yield from _generate_frames(container, skip_frames=skip_frames, max_frames=None) + remaining_skip = skip_frames + if first_frame is not None: + if remaining_skip == 0: + yield _frame_to_rgb(first_frame) + else: + remaining_skip -= 1 + yield from _generate_frames(container, skip_frames=remaining_skip, max_frames=None) finally: container.close() diff --git a/simple_video_utils/metadata.py b/simple_video_utils/metadata.py index 257cc7d..5537d6c 100644 --- a/simple_video_utils/metadata.py +++ b/simple_video_utils/metadata.py @@ -1,7 +1,7 @@ import io from contextlib import contextmanager from functools import lru_cache -from typing import NamedTuple, Optional, Union +from typing import NamedTuple import av @@ -10,13 +10,14 @@ class VideoMetadata(NamedTuple): width: int height: int fps: float - nb_frames: Optional[int] - time_base: Optional[str] - duration: Optional[float] # seconds; None if the container header doesn't carry one + nb_frames: int | None + time_base: str | None + duration: float | None # seconds; None if the container header doesn't carry one + rotation: int = 0 # display-matrix rotation in degrees; width/height already account for it @contextmanager -def _open_container(source: Union[str, io.BytesIO]): +def _open_container(source: str | io.BytesIO): """Context manager for safely opening and closing PyAV containers.""" container = None try: @@ -30,8 +31,39 @@ def _open_container(source: Union[str, io.BytesIO]): container.close() -def video_metadata_from_container(container: av.container.InputContainer) -> VideoMetadata: - """Extract metadata from an open PyAV container.""" +def _probe_rotation(container: av.container.InputContainer) -> int: + """ + Read the display-matrix rotation by decoding the first frame, then rewind. + + PyAV only exposes the rotation per-frame (``VideoFrame.rotation``), not on + the stream. Requires a seekable container; returns 0 if the video can't be + decoded. + """ + try: + frame = next(container.decode(video=0), None) + rotation = frame.rotation if frame is not None else 0 + except (av.FFmpegError, OSError): + rotation = 0 + container.seek(0) + return rotation + + +def video_metadata_from_container( + container: av.container.InputContainer, + rotation: int | None = None, +) -> VideoMetadata: + """ + Extract metadata from an open PyAV container. + + Width/height are reported in display orientation (rotation applied), + matching the frames yielded by the frames module. + + Args: + container: Open PyAV container. + rotation: Display rotation in degrees if already known (e.g. from a + decoded frame). When None, it is probed by decoding the first + frame and rewinding — pass it explicitly for non-seekable input. + """ stream = container.streams.video[0] fps = float(stream.average_rate) if stream.average_rate else 0.0 nb_frames = stream.frames if stream.frames > 0 else None @@ -50,13 +82,22 @@ def video_metadata_from_container(container: av.container.InputContainer) -> Vid else: duration = None + if rotation is None: + rotation = _probe_rotation(container) + rotation %= 360 + + width, height = stream.width, stream.height + if rotation % 180 == 90: + width, height = height, width + return VideoMetadata( - width=stream.width, - height=stream.height, + width=width, + height=height, fps=fps, nb_frames=nb_frames, time_base=time_base, duration=duration, + rotation=rotation, ) diff --git a/tests/assets/rotated90.mp4 b/tests/assets/rotated90.mp4 new file mode 100644 index 0000000..2d78b80 Binary files /dev/null and b/tests/assets/rotated90.mp4 differ diff --git a/tests/test_frames.py b/tests/test_frames.py index b3d224b..dba8ec2 100644 --- a/tests/test_frames.py +++ b/tests/test_frames.py @@ -75,7 +75,7 @@ def test_sequential_vs_range_reading(self, video_path): assert len(range_frames) == len(individual_frames) == 3 - for range_frame, individual_frame in zip(range_frames, individual_frames): + for range_frame, individual_frame in zip(range_frames, individual_frames, strict=False): np.testing.assert_array_equal(range_frame, individual_frame) def test_frames_are_different(self, video_path): @@ -165,7 +165,7 @@ def test_end_frame_none_consistency(self, video_path): assert len(frames1) == len(frames2) # Frames should be identical - for f1, f2 in zip(frames1, frames2): + for f1, f2 in zip(frames1, frames2, strict=False): np.testing.assert_array_equal(f1, f2) def test_end_frame_none_vs_explicit_end(self, video_path): @@ -253,7 +253,7 @@ def test_time_vs_frame_equivalence(self, video_path): assert len(frames_by_index) == len(frames_by_time) # Frames should be identical - for i, (frame_idx, frame_time) in enumerate(zip(frames_by_index, frames_by_time)): + for i, (frame_idx, frame_time) in enumerate(zip(frames_by_index, frames_by_time, strict=False)): np.testing.assert_array_equal( frame_idx, frame_time, @@ -297,7 +297,7 @@ def test_no_parameters_reads_all(self, video_path): # Should produce same result assert len(frames_no_params) == len(frames_explicit) - for f1, f2 in zip(frames_no_params, frames_explicit): + for f1, f2 in zip(frames_no_params, frames_explicit, strict=False): np.testing.assert_array_equal(f1, f2) def test_time_vs_frame_seeking_precision_remote(self): @@ -336,7 +336,7 @@ def test_time_vs_frame_seeking_precision_remote(self): ) # Every frame should be identical - for i, (frame_time, frame_idx) in enumerate(zip(frames_by_time, frames_by_frame)): + for i, (frame_time, frame_idx) in enumerate(zip(frames_by_time, frames_by_frame, strict=False)): actual_frame_num = start_frame_idx + i np.testing.assert_array_equal( frame_time, @@ -410,7 +410,7 @@ def test_read_frames_from_stream_all_frames(self, video_bytes, video_path): assert len(stream_frames) == len(file_frames) # Frames should be identical - for i, (stream_frame, file_frame) in enumerate(zip(stream_frames, file_frames)): + for i, (stream_frame, file_frame) in enumerate(zip(stream_frames, file_frames, strict=False)): np.testing.assert_array_equal( stream_frame, file_frame, @@ -430,7 +430,7 @@ def test_read_frames_from_stream_skip_frames(self, video_bytes, video_path): assert len(stream_frames) == len(file_frames) - for i, (stream_frame, file_frame) in enumerate(zip(stream_frames, file_frames)): + for i, (stream_frame, file_frame) in enumerate(zip(stream_frames, file_frames, strict=False)): np.testing.assert_array_equal( stream_frame, file_frame, diff --git a/tests/test_regression.py b/tests/test_regression.py index 76e151b..31f222d 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -2,9 +2,10 @@ import json import subprocess +from collections.abc import Generator from functools import lru_cache from pathlib import Path -from typing import Generator, NamedTuple, Optional +from typing import NamedTuple import numpy as np import pytest @@ -17,8 +18,8 @@ class VideoMetadata(NamedTuple): width: int height: int fps: float - nb_frames: Optional[int] - time_base: Optional[str] + nb_frames: int | None + time_base: str | None @lru_cache(maxsize=8) @@ -53,7 +54,7 @@ def ffprobe(url_or_path: str) -> VideoMetadata: def ffmpeg_read_frames_exact( # noqa: C901 src: str, start_frame: int, - end_frame: Optional[int] = None, + end_frame: int | None = None, ) -> Generator[np.ndarray, None, None]: """ Return frames [start_frame, end_frame] inclusive as RGB np.ndarrays using ffmpeg. @@ -194,7 +195,7 @@ def test_frames_match_ffmpeg_from_start(self, video_path): ) # Every frame should be identical (pixel-perfect) - for i, (pyav_frame, ffmpeg_frame) in enumerate(zip(pyav_frames, ffmpeg_frames)): + for i, (pyav_frame, ffmpeg_frame) in enumerate(zip(pyav_frames, ffmpeg_frames, strict=False)): np.testing.assert_array_equal( pyav_frame, ffmpeg_frame, @@ -265,7 +266,7 @@ def test_frames_match_ffmpeg_time_based(self, video_path): ) # Every frame should be identical - for i, (pyav_frame, ffmpeg_frame) in enumerate(zip(pyav_frames, ffmpeg_frames)): + for i, (pyav_frame, ffmpeg_frame) in enumerate(zip(pyav_frames, ffmpeg_frames, strict=False)): actual_frame_num = start_frame + i np.testing.assert_array_equal( pyav_frame, diff --git a/tests/test_rotation.py b/tests/test_rotation.py new file mode 100644 index 0000000..2dbdfa9 --- /dev/null +++ b/tests/test_rotation.py @@ -0,0 +1,141 @@ +"""Regression tests for videos with a display-matrix rotation (e.g. phone recordings). + +PyAV decodes frames in their stored orientation and does not apply the +rotation side data. These tests ensure frames are rotated to display +orientation and metadata reports display dimensions, matching the ffmpeg +CLI's autorotate behavior. +""" + +import os +import subprocess +import threading +from io import BytesIO +from pathlib import Path + +import numpy as np +import pytest + +from simple_video_utils.frames import read_frames_exact, read_frames_from_stream +from simple_video_utils.metadata import video_metadata, video_metadata_from_bytes + + +@pytest.fixture +def video_path(): + """Vertical phone-style video: stored as 640x360 landscape with rotation=90.""" + return str(Path(__file__).parent / "assets" / "rotated90.mp4") + + +@pytest.fixture +def video_bytes(video_path): + """Load the rotated video as bytes.""" + return Path(video_path).read_bytes() + + +def ffmpeg_autorotated_frames(src: str, num_frames: int, width: int, height: int) -> np.ndarray: + """Decode frames with the ffmpeg CLI, which applies the display rotation.""" + cmd = [ + "ffmpeg", "-v", "error", + "-i", src, + "-frames:v", str(num_frames), + "-f", "rawvideo", "-pix_fmt", "rgb24", + "pipe:1", + ] + out = subprocess.run(cmd, check=True, capture_output=True).stdout + return np.frombuffer(out, dtype=np.uint8).reshape(num_frames, height, width, 3) + + +def assert_frames_close(frame: np.ndarray, ref: np.ndarray, err_msg: str): + """Assert frames match up to YUV->RGB rounding differences between ffmpeg builds. + + A wrong rotation direction produces a mean difference of ~70, so the + tolerance still catches orientation errors. + """ + assert frame.shape == ref.shape, err_msg + diff = np.abs(frame.astype(int) - ref.astype(int)) + assert diff.max() <= 3, f"{err_msg}: max diff {diff.max()}" + assert diff.mean() < 1, f"{err_msg}: mean diff {diff.mean():.2f}" + + +class TestRotatedVideo: + """Tests using a 90°-rotated vertical video.""" + + def test_metadata_reports_display_orientation(self, video_path): + """Metadata width/height must be the display (portrait) dimensions.""" + meta = video_metadata(video_path) + + assert (meta.width, meta.height) == (360, 640) + assert meta.rotation == 90 + assert meta.nb_frames == 30 + + def test_metadata_from_bytes_reports_display_orientation(self, video_bytes): + """Bytes-based metadata must match the path-based behavior.""" + meta = video_metadata_from_bytes(video_bytes) + + assert (meta.width, meta.height) == (360, 640) + assert meta.rotation == 90 + + def test_unrotated_video_has_zero_rotation(self): + """A video without a display matrix reports rotation=0 and unchanged dims.""" + meta = video_metadata(str(Path(__file__).parent / "assets" / "example.mp4")) + + assert meta.rotation == 0 + + def test_frames_are_rotated_to_display_orientation(self, video_path): + """Decoded frames must come out portrait, not the stored landscape.""" + frames = list(read_frames_exact(video_path, 0, 4)) + + assert len(frames) == 5 + assert all(frame.shape == (640, 360, 3) for frame in frames) + # np.rot90 alone yields a non-contiguous view, which MediaPipe/OpenCV reject + assert all(frame.flags["C_CONTIGUOUS"] for frame in frames) + + def test_frames_match_ffmpeg_autorotate(self, video_path): + """Rotating the wrong way yields the same shape — compare pixels against ffmpeg.""" + frames = list(read_frames_exact(video_path, 0, 4)) + ref = ffmpeg_autorotated_frames(video_path, 5, width=360, height=640) + + for i, frame in enumerate(frames): + assert_frames_close(frame, ref[i], f"Frame {i} differs from ffmpeg") + + def test_stream_reading_rotates(self, video_bytes, video_path): + """The stream path must rotate frames and report display-oriented metadata.""" + meta, frames = read_frames_from_stream(BytesIO(video_bytes)) + ref = ffmpeg_autorotated_frames(video_path, 3, width=360, height=640) + + assert (meta.width, meta.height, meta.rotation) == (360, 640, 90) + for i in range(3): + assert_frames_close(next(frames), ref[i], f"Frame {i} differs from ffmpeg") + + def test_stream_reading_with_skip_frames(self, video_bytes, video_path): + """skip_frames must still work with the eagerly-decoded first frame.""" + ref = ffmpeg_autorotated_frames(video_path, 3, width=360, height=640) + _, frames = read_frames_from_stream(BytesIO(video_bytes), skip_frames=2) + + assert_frames_close(next(frames), ref[2], "Skipped-to frame differs from ffmpeg") + + def test_stream_reading_from_non_seekable_pipe(self, video_bytes): + """Rotation must work on a truly non-seekable stream (BytesIO can seek; a pipe cannot).""" + read_fd, write_fd = os.pipe() + read_file = os.fdopen(read_fd, "rb") + write_file = os.fdopen(write_fd, "wb") + + def writer(): + write_file.write(video_bytes) + write_file.close() + + thread = threading.Thread(target=writer) + thread.start() + try: + meta, frames = read_frames_from_stream(read_file) + frame_list = list(frames) + finally: + thread.join() + read_file.close() + + assert (meta.width, meta.height, meta.rotation) == (360, 640, 90) + assert len(frame_list) == 30 + assert all(frame.shape == (640, 360, 3) for frame in frame_list) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"])