diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py
index 18003740b00..6064709cd61 100644
--- a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py
+++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py
@@ -17,6 +17,7 @@
 from .crop import crop
 from .flips import horizontal_flip, vertical_flip
 from .gaussian_blur import gaussian_blur
+from .image_metadata import get_dimensions, get_image_size
 from .normalize import normalize
 from .pad import pad
 from .resize import resize
@@ -26,6 +27,8 @@
     "center_crop",
     "crop",
     "gaussian_blur",
+    "get_dimensions",
+    "get_image_size",
     "horizontal_flip",
     "normalize",
     "pad",
diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py
new file mode 100644
index 00000000000..9ec4c85891d
--- /dev/null
+++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/image_metadata.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+
+from PIL import Image
+import torch
+
+
+def get_image_size(inpt: Image.Image | torch.Tensor) -> List[int]:
+    """
+    Return the spatial size of an image as ``[width, height]``.
+
+    Mirrors ``torchvision.transforms.v2.functional.get_image_size``.
+
+    .. note::
+        This function is provided for compatibility. The torchvision successor
+        ``get_size`` returns ``[height, width]`` instead.
+
+    Parameters
+    ----------
+    inpt : PIL Image or torch.Tensor
+        Input image. Tensors are expected in ``[…, H, W]`` layout (leading
+        channel / batch dimensions are ignored).
+
+    Returns
+    -------
+    List[int]
+        ``[width, height]``
+    """
+    if isinstance(inpt, Image.Image):
+        return list(inpt.size)  # PIL .size is (W, H)
+    elif isinstance(inpt, torch.Tensor):
+        if inpt.ndim < 2:
+            raise TypeError(
+                f"get_image_size requires a tensor with at least 2 dimensions, got {inpt.ndim}."
+            )
+        return [inpt.shape[-1], inpt.shape[-2]]  # [W, H]
+    raise TypeError(f"Unsupported input type: {type(inpt)}.")
+
+
+def get_dimensions(inpt: Image.Image | torch.Tensor) -> List[int]:
+    """
+    Return the number of channels, height, and width of an image as
+    ``[channels, height, width]``.
+
+    Mirrors ``torchvision.transforms.v2.functional.get_dimensions``.
+
+    Parameters
+    ----------
+    inpt : PIL Image or torch.Tensor
+        Input image. Tensors are expected in ``[H, W]`` or ``[…, C, H, W]`` layout
+        (leading batch dimensions are ignored).
+
+    Returns
+    -------
+    List[int]
+        ``[channels, height, width]``
+    """
+    if isinstance(inpt, Image.Image):
+        w, h = inpt.size
+        return [len(inpt.getbands()), h, w]
+    elif isinstance(inpt, torch.Tensor):
+        if inpt.ndim < 2:
+            raise TypeError(
+                f"get_dimensions requires a tensor with at least 2 dimensions, got {inpt.ndim}."
+            )
+        if inpt.ndim == 2:
+            return [1, inpt.shape[-2], inpt.shape[-1]]
+        return [inpt.shape[-3], inpt.shape[-2], inpt.shape[-1]]  # [C, H, W]
+    raise TypeError(f"Unsupported input type: {type(inpt)}.")
diff --git a/dali/test/python/torchvision/test_tv_image_metadata.py b/dali/test/python/torchvision/test_tv_image_metadata.py
new file mode 100644
index 00000000000..3f24ebcd359
--- /dev/null
+++ b/dali/test/python/torchvision/test_tv_image_metadata.py
@@ -0,0 +1,186 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+import unittest
+
+from nose2.tools import cartesian_params, params
+from nose_utils import assert_raises
+from PIL import Image
+import torch
+from torchvision import tv_tensors
+import torchvision.transforms.v2.functional as fn_tv
+
+from nvidia.dali.experimental.torchvision.v2.functional import get_image_size, get_dimensions
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _tv_get_image_size(inpt):
+    """Call torchvision get_image_size while suppressing its deprecation warning."""
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", UserWarning)
+        return fn_tv.get_image_size(inpt)
+
+
+def _skip_if_gpu_unavailable(device):
+    if device == "gpu" and not torch.cuda.is_available():
+        raise unittest.SkipTest("CUDA is not available")
+
+
+def _move_tensor_to_device(tensor, device):
+    if device == "gpu":
+        return tensor.cuda()
+    return tensor
+
+
+def _make_compatibility_input(input_kind, shape):
+    tensor = torch.zeros(*shape)
+    if input_kind == "tensor":
+        return tensor
+    if input_kind == "tv_image":
+        return tv_tensors.Image(tensor)
+    raise ValueError(f"Unsupported input kind: {input_kind}")
+
+
+# PIL images with known exact dimensions (W x H)
+PIL_CASES = [
+    Image.new("RGB", (320, 240)),  # 3 channels
+    Image.new("L", (100, 50)),  # 1 channel, non-square
+    Image.new("RGBA", (64, 32)),  # 4 channels
+    Image.new("RGB", (1, 1)),  # minimal
+    Image.new("L", (512, 1)),  # extreme aspect ratio
+]
+
+# Tensors in CHW / NCHW layout — deliberately use H≠W to catch W/H swap bugs
+TENSOR_CASES = [
+    torch.zeros(3, 240, 320),  # CHW
+    torch.zeros(1, 3, 240, 320),  # NCHW, N=1
+    torch.zeros(8, 3, 240, 320),  # NCHW, N=8
+    torch.zeros(1, 50, 100),  # CHW, 1 channel
+    torch.zeros(4, 32, 64),  # CHW, 4 channels
+    torch.zeros(10, 11, 12, 8, 3, 240, 320),  # ...NCHW, N=8
+]
+
+TORCHVISION_COMPATIBILITY_CASES = [
+    ("tensor", (240, 320)),  # HW, implicit single channel
+    ("tensor", (3, 240, 320)),  # CHW
+    ("tensor", (8, 3, 240, 320)),  # NCHW
+    ("tv_image", (240, 320)),  # torchvision Image converts HW to 1HW
+    ("tv_image", (3, 240, 320)),  # torchvision Image, CHW
+]
+
+
+# ---------------------------------------------------------------------------
+# get_image_size — PIL
+# ---------------------------------------------------------------------------
+
+
+@params(*PIL_CASES)
+def test_get_image_size_pil(img):
+    expected = _tv_get_image_size(img)
+    assert (
+        get_image_size(img) == expected
+    ), f"mode={img.mode} size={img.size}: got {get_image_size(img)}, expected {expected}"
+
+
+# ---------------------------------------------------------------------------
+# get_image_size — tensors
+# ---------------------------------------------------------------------------
+
+
+@cartesian_params(("cpu", "gpu"), TENSOR_CASES)
+def test_get_image_size_tensor(device, t):
+    _skip_if_gpu_unavailable(device)
+    t = _move_tensor_to_device(t, device)
+    expected = _tv_get_image_size(t)
+    assert (
+        get_image_size(t) == expected
+    ), f"device={device} shape={t.shape}: got {get_image_size(t)}, expected {expected}"
+
+
+# ---------------------------------------------------------------------------
+# get_dimensions — PIL
+# ---------------------------------------------------------------------------
+
+
+@params(*PIL_CASES)
+def test_get_dimensions_pil(img):
+    expected = fn_tv.get_dimensions(img)
+    assert (
+        get_dimensions(img) == expected
+    ), f"mode={img.mode} size={img.size}: got {get_dimensions(img)}, expected {expected}"
+
+
+# ---------------------------------------------------------------------------
+# get_dimensions — tensors
+# ---------------------------------------------------------------------------
+
+
+@cartesian_params(("cpu", "gpu"), TENSOR_CASES)
+def test_get_dimensions_tensor(device, t):
+    _skip_if_gpu_unavailable(device)
+    t = _move_tensor_to_device(t, device)
+    expected = fn_tv.get_dimensions(t)
+    assert (
+        get_dimensions(t) == expected
+    ), f"device={device} shape={t.shape}: got {get_dimensions(t)}, expected {expected}"
+
+
+# ---------------------------------------------------------------------------
+# Torchvision compatibility
+# ---------------------------------------------------------------------------
+
+
+@params(*PIL_CASES)
+def test_image_metadata_pil_matches_torchvision(img):
+    assert get_image_size(img) == _tv_get_image_size(img)
+    assert get_dimensions(img) == fn_tv.get_dimensions(img)
+
+
+@cartesian_params(("cpu", "gpu"), TORCHVISION_COMPATIBILITY_CASES)
+def test_image_metadata_tensor_inputs_match_torchvision(device, input_case):
+    _skip_if_gpu_unavailable(device)
+    input_kind, shape = input_case
+    inpt = _move_tensor_to_device(_make_compatibility_input(input_kind, shape), device)
+
+    assert get_image_size(inpt) == _tv_get_image_size(inpt)
+    assert get_dimensions(inpt) == fn_tv.get_dimensions(inpt)
+
+
+# ---------------------------------------------------------------------------
+# Error cases
+# ---------------------------------------------------------------------------
+
+
+def test_get_image_size_1d_tensor_raises():
+    with assert_raises(TypeError):
+        get_image_size(torch.zeros(10))
+
+
+def test_get_dimensions_1d_tensor_raises():
+    with assert_raises(TypeError):
+        get_dimensions(torch.zeros(10))
+
+
+def test_get_image_size_unsupported_type_raises():
+    with assert_raises(TypeError):
+        get_image_size("not_an_image")
+
+
+def test_get_dimensions_unsupported_type_raises():
+    with assert_raises(TypeError):
+        get_dimensions("not_an_image")