diff --git a/dali/python/nvidia/dali/experimental/torchvision/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/__init__.py index 550dfd57bc5..1d8b828268f 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/__init__.py @@ -20,7 +20,7 @@ from .v2.normalize import Normalize from .v2.pad import Pad from .v2.rand_apply import RandomApply -from .v2.randomcrop import RandomCrop +from .v2.randomcrop import RandomCrop, RandomResizedCrop from .v2.resize import Resize from .v2.totensor import ToPureTensor, PILToTensor, ToPILImage @@ -37,6 +37,7 @@ "RandomCrop", "RandomGrayscale", "RandomHorizontalFlip", + "RandomResizedCrop", "RandomVerticalFlip", "Resize", "ToPILImage", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py index c58297e2f33..b1a524e6f19 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py @@ -14,7 +14,7 @@ from .centercrop import center_crop from .color import to_grayscale, rgb_to_grayscale -from .crop import crop +from .crop import crop, resized_crop from .flips import horizontal_flip, vertical_flip from .gaussian_blur import gaussian_blur from .image_metadata import get_dimensions, get_image_size, get_size @@ -35,6 +35,7 @@ "pad", "pil_to_tensor", "resize", + "resized_crop", "rgb_to_grayscale", "to_grayscale", "to_pil_image", diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py index d3529248f2c..ed5746fefd7 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/crop.py @@ -13,13 +13,22 @@ # limitations under the License. import operator +from typing import List import nvidia.dali.experimental.dynamic as ndd +from torchvision.transforms import InterpolationMode + from nvidia.dali._typing import TensorLike from nvidia.dali.experimental.dynamic._device import DeviceLike from ..operator import adjust_input from ..randomcrop import RandomCrop +from ..resize import Resize + + +def _verify_crop_coordinate(value, name: str) -> None: + if not isinstance(value, int): + raise TypeError(f"{name} must be int, got {type(value)}") def _validate_integer_param(value, name: str) -> int: @@ -61,6 +70,26 @@ def _validate_crop_params(inpt, top, left, height, width) -> tuple[int, int, int ) +def _crop( + inpt: ndd.Tensor | ndd.Batch, + top: int, + left: int, + height: int, + width: int, + device: DeviceLike = "cpu", +) -> ndd.Tensor | ndd.Batch: + axes = [-3, -2] if _is_pil_image_layout(inpt) else [-2, -1] + return ndd.slice( + inpt, + (top, left), + (height, width), + axes=axes, + out_of_bounds_policy="pad", + fill_values=0, + device=device, + ) + + @adjust_input def crop( inpt: TensorLike | ndd.Batch, @@ -82,13 +111,48 @@ def crop( fill=0, ) - return ndd.slice( - inpt, - [float(left), float(top)], - [float(width), float(height)], - normalized_anchor=False, - normalized_shape=False, - out_of_bounds_policy="pad", - fill_values=0, + return _crop(inpt, top, left, height, width, device=device) + + +@adjust_input +def resized_crop( + inpt: TensorLike | ndd.Batch, + top: int, + left: int, + height: int, + width: int, + size: int | List[int], + interpolation: InterpolationMode | int = InterpolationMode.BILINEAR, + antialias: bool = True, + device: DeviceLike = "cpu", +) -> ndd.Tensor | ndd.Batch: + """ + Crop the input at location (top, left) with dimensions (height, width), + then resize the crop to the given size. + """ + top, left, height, width = _validate_crop_params(inpt, top, left, height, width) + RandomCrop.verify_args( + size=(height, width), + padding=None, + pad_if_needed=False, + padding_mode="constant", + fill=0, + ) + interpolation = Resize.normalize_interpolation(interpolation) + Resize.verify_args(size=size, max_size=None, interpolation=interpolation, antialias=antialias) + + size_normalized = Resize.infer_effective_size(size) + interpolation = Resize.interpolation_modes[interpolation] + + cropped = _crop(inpt, top, left, height, width, device=device) + target_h, target_w = Resize.calculate_target_size_dynamic_mode( + (height, width), size_normalized, None + ) + + return ndd.resize( + cropped, device=device, + size=(target_h, target_w), + interp_type=interpolation, + antialias=antialias, ) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py index 89ef53d7113..f3c3b9b15dd 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/randomcrop.py @@ -17,6 +17,7 @@ import nvidia.dali as dali import nvidia.dali.fn as fn +from torchvision.transforms import InterpolationMode from .centercrop import CenterCrop from .operator import ( @@ -27,6 +28,7 @@ get_HWC_from_layout_pipeline, ) from .pad import PADDING_CLASS, _ValidatePaddingMode +from .resize import Resize class _ValidateCropSize(_ArgumentValidateRule): @@ -86,6 +88,41 @@ def verify(cls, *, fill, **_) -> None: raise TypeError(f"fill must be a number, sequence of numbers, or None, got {fill!r}") +class _ValidateRandomResizedCropScaleRatio(_ArgumentValidateRule): + """ + Verify RandomResizedCrop scale and ratio arguments. + """ + + @classmethod + def _verify_range(cls, value, name: str) -> None: + if not isinstance(value, (list, tuple)) or len(value) != 2: + raise TypeError(f"{name} should be a sequence of two numbers") + if any(not isinstance(elem, numbers.Number) for elem in value): + raise TypeError(f"{name} values must be numbers, got {value}") + if any(elem <= 0 for elem in value): + raise ValueError(f"{name} values must be positive, got {value}") + if value[0] > value[1]: + raise ValueError(f"{name} should be a (min, max) range, got {value}") + + @classmethod + def verify(cls, *, scale, ratio, **_) -> None: + cls._verify_range(scale, "scale") + cls._verify_range(ratio, "ratio") + + +class _ValidateRandomResizedCropInterpolation(_ArgumentValidateRule): + """ + Verify RandomResizedCrop interpolation argument. + """ + + @classmethod + def verify(cls, *, interpolation, **_) -> None: + if interpolation in Resize.not_supported_interpolation_modes: + raise NotImplementedError(f"Interpolation mode: {interpolation} is not supported") + if interpolation not in Resize.interpolation_modes: + raise ValueError(f"Interpolation {interpolation!r} is not supported") + + class RandomCrop(Operator): """ Crop the input at a random location. @@ -229,3 +266,85 @@ def _kernel(self, data_input): fn.stack(crop_w, crop_h), **slice_kwargs, ) + + +class RandomResizedCrop(Operator): + """ + Crop a random portion of the input and resize it to a given size. + + If the input is a ``torch.Tensor`` it can have an arbitrary number of leading batch dimensions. + For example, the image tensor can have [..., C, H, W] shape. + + Parameters + ---------- + size : sequence or int + Expected output size of the crop. If size is an int instead of sequence like (h, w), + a square output size (size, size) is made. If provided a sequence of length 1, it will be + interpreted as (size[0], size[0]). + scale : tuple of float, optional, default = (0.08, 1.0) + Lower and upper bounds for the random crop area, relative to the input image area. + ratio : tuple of float, optional, default = (3 / 4, 4 / 3) + Lower and upper bounds for the random crop aspect ratio, width / height. + interpolation : InterpolationMode or int, optional, default = InterpolationMode.BILINEAR + Interpolation mode to use for resizing. Legacy PIL integer codes + (``0`` = NEAREST, ``1`` = LANCZOS, ``2`` = BILINEAR, ``3`` = BICUBIC, + ``4`` = BOX, ``5`` = HAMMING) are accepted for torchvision compatibility. + antialias : bool, optional, default = True + Whether to apply antialiasing during resize. + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the crop. Can be ``"cpu"`` or ``"gpu"``. + """ + + arg_rules = [ + _ValidateSizeDescriptor, + _ValidateCropSize, + _ValidateRandomResizedCropScaleRatio, + _ValidateRandomResizedCropInterpolation, + ] + preprocess_data = get_HWC_from_layout_pipeline + + @classmethod + def adjust_size(cls, size: int | Sequence[int]) -> Sequence[int]: + return CenterCrop.adjust_size(size) + + def __init__( + self, + size: int | Sequence[int], + scale: tuple[float, float] = (0.08, 1.0), + ratio: tuple[float, float] = (3.0 / 4.0, 4.0 / 3.0), + interpolation: InterpolationMode | int = InterpolationMode.BILINEAR, + antialias: bool | None = True, + device: Literal["cpu", "gpu"] = "cpu", + ): + interpolation = Resize.normalize_interpolation(interpolation) + + super().__init__( + device=device, + size=size, + scale=scale, + ratio=ratio, + interpolation=interpolation, + ) + + self.size = RandomResizedCrop.adjust_size(size) + self.scale = tuple(scale) + self.ratio = tuple(ratio) + self.interpolation = Resize.interpolation_modes[interpolation] + self.antialias = antialias + + def _kernel(self, data_input): + """ + Applies random resized crop to the input data. + """ + _, _, _, tensor = data_input + + return fn.random_resized_crop( + tensor, + device=self.device, + size=self.size, + random_area=self.scale, + random_aspect_ratio=self.ratio, + interp_type=self.interpolation, + antialias=self.antialias, + num_attempts=10, + ) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/resize.py b/dali/python/nvidia/dali/experimental/torchvision/v2/resize.py index c1723648178..1347e5376b9 100644 --- a/dali/python/nvidia/dali/experimental/torchvision/v2/resize.py +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/resize.py @@ -44,6 +44,16 @@ def verify(cls, *, size, max_size, interpolation, **_): edge, i.e. size should be an int" ) + if isinstance(size, int) and size <= 0: + raise ValueError(f"size must be positive, got {size}") + if isinstance(size, (tuple, list)): + if len(size) not in (1, 2): + raise ValueError(f"size sequence must have length 1 or 2, got {len(size)}") + if any(not isinstance(s, int) for s in size): + raise ValueError(f"size values must be integers, got {size}") + if any(s <= 0 for s in size): + raise ValueError(f"size values must be positive, got {size}") + if interpolation in Resize.not_supported_interpolation_modes: raise NotImplementedError(f"Interpolation mode: {interpolation} is not supported") @@ -98,9 +108,32 @@ class Resize(Operator): InterpolationMode.HAMMING, ] + # Legacy PIL integer codes accepted by torchvision for back-compat + # (mirrors torchvision.transforms.functional._interpolation_modes_from_int). + int_to_interpolation_mode = { + 0: InterpolationMode.NEAREST, + 1: InterpolationMode.LANCZOS, + 2: InterpolationMode.BILINEAR, + 3: InterpolationMode.BICUBIC, + 4: InterpolationMode.BOX, + 5: InterpolationMode.HAMMING, + } + arg_rules = [_ValidateSize] preprocess_data = get_HWC_from_layout_pipeline + @classmethod + def normalize_interpolation(cls, interpolation): + if isinstance(interpolation, int) and not isinstance(interpolation, InterpolationMode): + try: + return cls.int_to_interpolation_mode[interpolation] + except KeyError: + raise ValueError( + f"Interpolation int {interpolation} is not a valid PIL code; " + f"expected one of {sorted(cls.int_to_interpolation_mode)}" + ) + return interpolation + @classmethod def infer_effective_size( cls, @@ -228,6 +261,7 @@ def __init__( antialias: Optional[bool] = True, device: Literal["cpu", "gpu"] = "cpu", ): + interpolation = Resize.normalize_interpolation(interpolation) super().__init__( device=device, diff --git a/dali/test/python/torchvision/test_tv_crop.py b/dali/test/python/torchvision/test_tv_crop.py index 51a31d9ebfa..e82aa569e01 100644 --- a/dali/test/python/torchvision/test_tv_crop.py +++ b/dali/test/python/torchvision/test_tv_crop.py @@ -130,9 +130,9 @@ def test_crop_preserves_tensor_dtype(dtype): dict(top=0, left=0, height=1, width=1.0), ) def test_crop_tensor_rejects_float_parameters(crop_kwargs): - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*integer*"): _ = tv_fn.crop(make_test_tensor(), **crop_kwargs) - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*integer*"): _ = crop(make_test_tensor(), **crop_kwargs) @@ -144,16 +144,16 @@ def test_crop_tensor_rejects_float_parameters(crop_kwargs): ) def test_crop_pil_rejects_non_numeric_parameters(crop_kwargs): pil_image = _make_pil_image("RGB") - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*str*"): _ = tv_fn.crop(pil_image, **crop_kwargs) - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*real numbers*"): _ = crop(pil_image, **crop_kwargs) def test_crop_invalid_input_type(): - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*support*"): _ = tv_fn.crop([1, 2, 3], top=0, left=0, height=1, width=1) - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*support*"): _ = crop([1, 2, 3], top=0, left=0, height=1, width=1) @@ -166,7 +166,7 @@ def test_crop_invalid_input_type(): (1, 1.0), ) def test_crop_invalid_output_size(height, width): - with assert_raises((TypeError, ValueError)): + with assert_raises((TypeError, ValueError), glob="*must be*"): _ = crop(make_test_tensor(), top=0, left=0, height=height, width=width) @@ -177,7 +177,7 @@ def test_crop_invalid_output_size(height, width): (0, "0"), ) def test_crop_invalid_coordinates(top, left): - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*int*"): _ = tv_fn.crop(make_test_tensor(), top=top, left=left, height=1, width=1) - with assert_raises(TypeError): + with assert_raises(TypeError, glob="*int*"): _ = crop(make_test_tensor(), top=top, left=left, height=1, width=1) diff --git a/dali/test/python/torchvision/test_tv_randomresizedcrop.py b/dali/test/python/torchvision/test_tv_randomresizedcrop.py new file mode 100644 index 00000000000..12d57f8733f --- /dev/null +++ b/dali/test/python/torchvision/test_tv_randomresizedcrop.py @@ -0,0 +1,313 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import unittest + +from nose2.tools import cartesian_params, params +from nose_utils import assert_raises +import numpy as np +from PIL import Image +import torch +import torchvision.transforms.v2 as transforms +import torchvision.transforms.v2.functional as tv_fn + +import nvidia.dali.experimental.torchvision.v2.randomcrop as randomcrop_module +from nvidia.dali.experimental.torchvision import Compose, RandomResizedCrop +from nvidia.dali.experimental.torchvision.v2.operator import Operator +from nvidia.dali.experimental.torchvision.v2.resize import Resize + + +def make_tensor(shape=(3, 8, 10), dtype=torch.uint8): + return torch.arange(math.prod(shape), dtype=dtype).reshape(shape) + + +def make_pil_image(mode="RGB", h=8, w=10, seed=42): + rng = np.random.default_rng(seed) + if mode == "L": + data = rng.integers(0, 256, (h, w), dtype=np.uint8) + elif mode == "RGB": + data = rng.integers(0, 256, (h, w, 3), dtype=np.uint8) + elif mode == "RGBA": + data = rng.integers(0, 256, (h, w, 4), dtype=np.uint8) + else: + raise ValueError(f"Unsupported mode: {mode}") + return Image.fromarray(data, mode=mode) + + +def _to_tensor(inpt): + if isinstance(inpt, Image.Image): + return tv_fn.pil_to_tensor(inpt) + return inpt + + +def _build_dali_random_resized_crop(**kwargs): + batch_size = kwargs.pop("batch_size", 1) + return Compose([RandomResizedCrop(**kwargs)], batch_size=batch_size) + + +def _skip_if_gpu_unavailable(device): + if device == "gpu" and not torch.cuda.is_available(): + raise unittest.SkipTest("CUDA is not available") + + +def _move_tensor_to_device(inpt, device): + if device == "gpu" and isinstance(inpt, torch.Tensor): + return inpt.cuda() + return inpt + + +def _assert_allclose_to_torchvision(inpt, dali_transform, tv_transform, device="cpu", atol=1): + out = dali_transform(inpt) + tv_out = tv_transform(inpt) + + out = _to_tensor(out) + tv_out = _to_tensor(tv_out) + if device == "gpu": + out = out.cpu() + if isinstance(tv_out, torch.Tensor): + tv_out = tv_out.cpu() + + assert out.shape == tv_out.shape, f"Shape mismatch: {out.shape} != {tv_out.shape}" + assert torch.allclose(out, tv_out, rtol=0, atol=atol), "DALI output differs from torchvision" + + +def test_random_resized_crop_is_operator(): + assert issubclass(RandomResizedCrop, Operator) + + +def test_random_resized_crop_exported_from_randomcrop_module(): + from nvidia.dali.experimental.torchvision.v2.randomcrop import RandomResizedCrop as exported + + assert RandomResizedCrop is exported + + +def test_random_resized_crop_uses_dali_operator(): + transform = RandomResizedCrop( + size=(4, 5), + scale=(0.5, 1.0), + ratio=(0.75, 1.25), + interpolation=transforms.InterpolationMode.NEAREST, + antialias=False, + ) + calls = [] + + def fake_random_resized_crop(tensor, **kwargs): + calls.append((tensor, kwargs)) + return "cropped" + + old_random_resized_crop = randomcrop_module.fn.random_resized_crop + try: + randomcrop_module.fn.random_resized_crop = fake_random_resized_crop + out = transform._kernel((8, 10, 3, "input")) + finally: + randomcrop_module.fn.random_resized_crop = old_random_resized_crop + + assert out == "cropped" + assert len(calls) == 1 + tensor, kwargs = calls[0] + assert tensor == "input" + assert kwargs["size"] == (4, 5) + assert kwargs["random_area"] == (0.5, 1.0) + assert kwargs["random_aspect_ratio"] == (0.75, 1.25) + assert kwargs["interp_type"] == Resize.interpolation_modes[transforms.InterpolationMode.NEAREST] + assert kwargs["antialias"] is False + assert kwargs["num_attempts"] == 10 + + +@cartesian_params( + ("cpu", "gpu"), + ( + ("tensor", (3, 8, 10)), + ("tensor", (4, 3, 8, 10)), + ("pil", "L"), + ("pil", "RGB"), + ("pil", "RGBA"), + ), +) +def test_random_resized_crop_identity_matches_torchvision(device, input_case): + _skip_if_gpu_unavailable(device) + input_type, input_arg = input_case + inpt = make_pil_image(input_arg) if input_type == "pil" else make_tensor(shape=input_arg) + inpt = _move_tensor_to_device(inpt, device) + batch_size = inpt.shape[0] if isinstance(inpt, torch.Tensor) and inpt.ndim > 3 else 1 + + kwargs = { + "size": (8, 10), + "scale": (1.0, 1.0), + "ratio": (10.0 / 8.0, 10.0 / 8.0), + "interpolation": transforms.InterpolationMode.NEAREST, + "antialias": False, + } + + _assert_allclose_to_torchvision( + inpt, + _build_dali_random_resized_crop(**kwargs, device=device, batch_size=batch_size), + transforms.RandomResizedCrop(**kwargs), + device=device, + atol=0, + ) + + +@cartesian_params( + ("cpu", "gpu"), + ( + (4, (4, 4)), + ([4], (4, 4)), + ([4, 5], (4, 5)), + ((5, 4), (5, 4)), + ), +) +def test_random_resized_crop_tensor_shape(device, shape_case): + _skip_if_gpu_unavailable(device) + size, expected_hw = shape_case + tensor = _move_tensor_to_device(make_tensor(), device) + out = _build_dali_random_resized_crop( + size=size, + scale=(0.5, 1.0), + ratio=(0.75, 1.3333333333333333), + device=device, + )(tensor) + + assert out.shape == (3, *expected_hw) + + +@cartesian_params(("cpu", "gpu")) +def test_random_resized_crop_batched_tensor_shape(device): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_tensor(shape=(4, 3, 8, 10)), device) + out = _build_dali_random_resized_crop( + size=(4, 5), + scale=(0.5, 1.0), + ratio=(0.75, 1.3333333333333333), + device=device, + batch_size=4, + )(tensor) + + assert out.shape == (4, 3, 4, 5) + + +@cartesian_params(("cpu", "gpu")) +def test_random_resized_crop_samples_different_crops(device): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_tensor(shape=(3, 32, 40)), device) + transform = _build_dali_random_resized_crop( + size=(8, 10), + scale=(0.2, 0.8), + ratio=(0.75, 1.3333333333333333), + interpolation=transforms.InterpolationMode.NEAREST, + antialias=False, + device=device, + ) + + outputs = {bytes(transform(tensor).cpu().numpy().tobytes()) for _ in range(20)} + + assert len(outputs) > 1, "RandomResizedCrop produced the same crop for every run" + + +@cartesian_params( + ( + transforms.InterpolationMode.NEAREST, + transforms.InterpolationMode.BILINEAR, + transforms.InterpolationMode.BICUBIC, + ), + ("cpu", "gpu"), +) +def test_random_resized_crop_interpolation_shape(interpolation, device): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_tensor(), device) + out = _build_dali_random_resized_crop( + size=(4, 5), + scale=(1.0, 1.0), + ratio=(10.0 / 8.0, 10.0 / 8.0), + interpolation=interpolation, + antialias=False, + device=device, + )(tensor) + + assert out.shape == (3, 4, 5) + + +def test_random_resized_crop_unsupported_interpolation(): + with assert_raises(NotImplementedError, glob="*Interpolation mode*"): + _ = RandomResizedCrop(size=3, interpolation=transforms.InterpolationMode.NEAREST_EXACT) + + +@cartesian_params((True, False), ("cpu", "gpu")) +def test_random_resized_crop_antialias_shape(antialias, device): + _skip_if_gpu_unavailable(device) + tensor = _move_tensor_to_device(make_tensor(), device) + out = _build_dali_random_resized_crop( + size=(4, 5), + scale=(1.0, 1.0), + ratio=(10.0 / 8.0, 10.0 / 8.0), + interpolation=transforms.InterpolationMode.BILINEAR, + antialias=antialias, + device=device, + )(tensor) + + assert out.shape == (3, 4, 5) + + +@params( + [], + [0, 5], + [5, 0], + [1.0, 2], + [1, 2, 3], + -1, + 1.0, + {"bad": "value"}, +) +def test_random_resized_crop_invalid_size(size): + with assert_raises((TypeError, ValueError), glob="*size*"): + _ = RandomResizedCrop(size=size) + + +@params( + ("scale", object()), + ("scale", "bad"), + ("scale", [1]), + ("scale", [1, 2, 3]), + ("scale", [1, object()]), + ("scale", [-1, 1]), + ("scale", [1, 0.5]), + ("ratio", object()), + ("ratio", "bad"), + ("ratio", [1]), + ("ratio", [1, 2, 3]), + ("ratio", [1, object()]), + ("ratio", [0, 1]), + ("ratio", [1, 0.5]), +) +def test_random_resized_crop_invalid_scale_ratio(name, value): + kwargs = {name: value} + with assert_raises((TypeError, ValueError), glob=f"*{name}*"): + _ = RandomResizedCrop(size=3, **kwargs) + + +def test_random_resized_crop_invalid_interpolation(): + with assert_raises(ValueError, glob="*Interpolation*"): + _ = RandomResizedCrop(size=3, interpolation="bad") + + +def test_random_resized_crop_int_interpolation_normalizes_to_enum(): + transform = RandomResizedCrop(size=(4, 5), interpolation=2) + expected = Resize.interpolation_modes[transforms.InterpolationMode.BILINEAR] + assert transform.interpolation == expected + + +def test_random_resized_crop_invalid_int_interpolation(): + with assert_raises(ValueError, glob="*PIL code*"): + _ = RandomResizedCrop(size=3, interpolation=99) diff --git a/dali/test/python/torchvision/test_tv_resized_crop.py b/dali/test/python/torchvision/test_tv_resized_crop.py new file mode 100644 index 00000000000..6f3f0094f5c --- /dev/null +++ b/dali/test/python/torchvision/test_tv_resized_crop.py @@ -0,0 +1,333 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import List, Literal + +import torch +from nose2.tools import cartesian_params, params +from nose_utils import assert_raises +from PIL import Image +import torchvision.transforms.v2 as transforms +import torchvision.transforms.v2.functional as fn_tv + +import nvidia.dali.experimental.torchvision.v2.functional as fn_dali + +dali_extra = os.environ["DALI_EXTRA_PATH"] +jpeg = os.path.join(dali_extra, "db", "single", "jpeg") +jpeg_113 = os.path.join(jpeg, "113") +test_files = [ + os.path.join(jpeg_113, f) + for f in ["snail-4291306_1280.jpg", "snail-4345504_1280.jpg", "snail-4368154_1280.jpg"] +] + + +def _run_one( + inpt, + top: int, + left: int, + height: int, + width: int, + size: int | List[int], + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + antialias: bool = True, + device: Literal["cpu", "gpu"] = "cpu", +): + """Run torchvision and DALI on the same input and assert shape / pixel agreement.""" + out_tv = fn_tv.resized_crop( + inpt, top, left, height, width, size, interpolation=interpolation, antialias=antialias + ) + out_dali = fn_dali.resized_crop( + inpt, + top, + left, + height, + width, + size, + interpolation=interpolation, + antialias=antialias, + device=device, + ) + + if isinstance(inpt, Image.Image): + # Shape comparison only — PIL round-trips introduce additional rounding + out_tv_t = transforms.functional.pil_to_tensor(out_tv) + out_dali_t = transforms.functional.pil_to_tensor(out_dali) + assert ( + out_tv_t.shape == out_dali_t.shape + ), f"Shape mismatch: tv={out_tv_t.shape} dali={out_dali_t.shape}" + else: + if out_tv.device != out_dali.device: + out_dali = out_dali.to(out_tv.device) + assert torch.allclose( + out_tv, out_dali, rtol=0, atol=1 + ), f"Pixel mismatch: max diff={(out_tv.int() - out_dali.int()).abs().max().item()}" + + +def loop_images_test( + top: int, + left: int, + height: int, + width: int, + size: int | List[int], + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + antialias: bool = True, + device: Literal["cpu", "gpu"] = "cpu", +): + for filepath in test_files: + img = Image.open(filepath) + _run_one(img, top, left, height, width, size, interpolation, antialias, device) + + +def build_tensors(h: int = 256, w: int = 320, channels: int = 3): + """Return a variety of CHW and NCHW tensors with the given spatial dimensions.""" + return [ + torch.randint(0, 256, (channels, h, w), dtype=torch.uint8), + torch.randint(0, 256, (1, channels, h, w), dtype=torch.uint8), + torch.randint(0, 256, (4, channels, h, w), dtype=torch.uint8), + ] + + +def loop_tensors_test( + top: int, + left: int, + height: int, + width: int, + size: int | List[int], + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + antialias: bool = False, + device: Literal["cpu", "gpu"] = "cpu", + tensor_h: int = 256, + tensor_w: int = 320, + channels: int = 3, +): + # antialias=False by default: antialiased downscaling differs between DALI and + # torchvision filter implementations and is not expected to be pixel-exact. + for tn in build_tensors(h=tensor_h, w=tensor_w, channels=channels): + _run_one(tn, top, left, height, width, size, interpolation, antialias, device) + + +# --------------------------------------------------------------------------- +# Output size variants +# --------------------------------------------------------------------------- + + +@cartesian_params((112, 224, [224, 224], [112, 336]), ("cpu", "gpu")) +def test_resized_crop_sizes_images(size, device): + # Crop a 400×600 window from the top-left area, then resize + loop_images_test(top=10, left=10, height=400, width=600, size=size, device=device) + + +@cartesian_params((64, 128, [128, 128], [64, 96]), ("cpu", "gpu")) +def test_resized_crop_sizes_tensors(size, device): + # Tensors are 256×320; crop a 128×160 window from (32, 40) + loop_tensors_test(top=32, left=40, height=128, width=160, size=size, device=device) + + +# --------------------------------------------------------------------------- +# Crop position and window variants +# --------------------------------------------------------------------------- + + +@cartesian_params( + ((0, 0, 200, 200), (50, 100, 300, 400), (10, 10, 400, 600)), + ("cpu", "gpu"), +) +def test_resized_crop_crop_positions_images(crop, device): + top, left, height, width = crop + loop_images_test(top=top, left=left, height=height, width=width, size=224, device=device) + + +@cartesian_params( + ((0, 0, 100, 100), (10, 20, 128, 160), (50, 50, 64, 64)), + ("cpu", "gpu"), +) +def test_resized_crop_crop_positions_tensors(crop, device): + top, left, height, width = crop + loop_tensors_test(top=top, left=left, height=height, width=width, size=128, device=device) + + +@cartesian_params( + ((-1, -2, 6, 8, [6, 8]), (6, 8, 5, 6, [5, 6]), (0, 0, 12, 14, [12, 14])), + ("cpu", "gpu"), +) +def test_resized_crop_crop_padding_tensors(crop_case, device): + top, left, height, width, size = crop_case + loop_tensors_test( + top=top, + left=left, + height=height, + width=width, + size=size, + interpolation=transforms.InterpolationMode.NEAREST, + antialias=False, + device=device, + tensor_h=8, + tensor_w=10, + ) + + +# --------------------------------------------------------------------------- +# Interpolation modes +# --------------------------------------------------------------------------- + + +@cartesian_params( + ( + transforms.InterpolationMode.NEAREST, + transforms.InterpolationMode.NEAREST_EXACT, + transforms.InterpolationMode.BILINEAR, + transforms.InterpolationMode.BICUBIC, + ), + ("cpu", "gpu"), +) +def test_resized_crop_interpolation(interpolation, device): + if interpolation == transforms.InterpolationMode.NEAREST_EXACT: + with assert_raises(NotImplementedError, glob="*Interpolation mode*"): + loop_images_test( + top=10, + left=10, + height=400, + width=600, + size=224, + interpolation=interpolation, + device=device, + ) + else: + loop_images_test( + top=10, + left=10, + height=400, + width=600, + size=224, + interpolation=interpolation, + device=device, + ) + + +# --------------------------------------------------------------------------- +# Antialias +# --------------------------------------------------------------------------- + + +@cartesian_params((True, False), ("cpu", "gpu")) +def test_resized_crop_antialias_images(antialias, device): + loop_images_test( + top=10, left=10, height=400, width=600, size=224, antialias=antialias, device=device + ) + + +@cartesian_params((True, False), ("cpu", "gpu")) +def test_resized_crop_antialias_tensors(antialias, device): + # Antialiased downscaling differs between DALI and torchvision filter implementations, + # so only assert shape agreement (not pixel values) — same approach as test_tv_resize.py. + out_tv = fn_tv.resized_crop(build_tensors()[0], 32, 40, 128, 160, 128, antialias=antialias) + out_dali = fn_dali.resized_crop( + build_tensors()[0], 32, 40, 128, 160, 128, antialias=antialias, device=device + ) + out_dali = out_dali.cpu() + assert ( + out_tv.shape == out_dali.shape + ), f"Shape mismatch: tv={out_tv.shape} dali={out_dali.shape}" + + +# --------------------------------------------------------------------------- +# Validation and exports +# --------------------------------------------------------------------------- + + +def test_resized_crop_exported_from_crop_module(): + from nvidia.dali.experimental.torchvision.v2.functional.crop import resized_crop + + assert fn_dali.resized_crop is resized_crop + + +@params( + (0.5, 0), + ("0", 0), + (0, 0.5), + (0, "0"), +) +def test_resized_crop_invalid_coordinates(top, left): + with assert_raises(TypeError, glob="*must be an integer*"): + _ = fn_dali.resized_crop( + build_tensors(h=8, w=10)[0], top=top, left=left, height=1, width=1, size=1 + ) + + +@params( + (0, 1), + (1, 0), + (-1, 1), + (1, -1), + (1.0, 1), + (1, 1.0), +) +def test_resized_crop_invalid_crop_size(height, width): + with assert_raises((TypeError, ValueError), glob="*must be*"): + _ = fn_dali.resized_crop( + build_tensors(h=8, w=10)[0], top=0, left=0, height=height, width=width, size=1 + ) + + +@params( + [], + [0, 5], + [5, 0], + [1.0, 2], + [1, 2, 3], + -1, + 0, + 1.0, + {"bad": "value"}, +) +def test_resized_crop_invalid_size(size): + with assert_raises((TypeError, ValueError), glob="*size*"): + _ = fn_dali.resized_crop( + build_tensors(h=8, w=10)[0], top=0, left=0, height=4, width=4, size=size + ) + + +def test_resized_crop_invalid_interpolation(): + with assert_raises(ValueError, glob="*Interpolation*"): + _ = fn_dali.resized_crop( + build_tensors(h=8, w=10)[0], + top=0, + left=0, + height=4, + width=4, + size=2, + interpolation="bad", + ) + + +def test_resized_crop_invalid_int_interpolation(): + with assert_raises(ValueError, glob="*PIL code*"): + _ = fn_dali.resized_crop( + build_tensors(h=8, w=10)[0], + top=0, + left=0, + height=4, + width=4, + size=2, + interpolation=99, + ) + + +@params([1, 2, 3], "not a tensor", 42) +def test_resized_crop_invalid_input_type(inpt): + with assert_raises(TypeError, glob="*support*"): + _ = fn_tv.resized_crop(inpt, top=0, left=0, height=1, width=1, size=1) + with assert_raises(TypeError, glob="*support*"): + _ = fn_dali.resized_crop(inpt, top=0, left=0, height=1, width=1, size=1)