From 339f538394372e854aa19a1bdfc2de2fb4006ea3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 15:14:13 +0000 Subject: [PATCH 1/3] Added docs for v2 transforms (part 1) --- docs/source/conf.py | 2 + docs/source/transforms.rst | 40 ++ torchvision/transforms/v2/_augment.py | 31 ++ torchvision/transforms/v2/_auto_augment.py | 80 ++++ torchvision/transforms/v2/_color.py | 139 +++++++ torchvision/transforms/v2/_container.py | 65 ++++ torchvision/transforms/v2/_deprecated.py | 25 ++ torchvision/transforms/v2/_geometry.py | 367 +++++++++++++++++- torchvision/transforms/v2/_meta.py | 21 + torchvision/transforms/v2/_misc.py | 69 ++++ torchvision/transforms/v2/_type_conversion.py | 30 ++ 11 files changed, 848 insertions(+), 21 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 72c83d7893d..304a1cc6e22 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -33,6 +33,8 @@ sys.path.append(os.path.abspath(".")) +torchvision.disable_beta_transforms_warning() + # -- General configuration ------------------------------------------------ # Required version of sphinx is set from docs/requirements.txt diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index d831b81e37f..00d929d0675 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -98,17 +98,29 @@ Geometry :template: class.rst Resize + v2.Resize RandomCrop + v2.RandomCrop RandomResizedCrop + v2.RandomResizedCrop CenterCrop + v2.CenterCrop FiveCrop + v2.FiveCrop TenCrop + v2.TenCrop Pad + v2.Pad RandomAffine + v2.RandomAffine RandomPerspective + v2.RandomPerspective RandomRotation + v2.RandomRotation RandomHorizontalFlip + v2.RandomHorizontalFlip RandomVerticalFlip + v2.RandomVerticalFlip Color ----- @@ -118,15 +130,25 @@ Color :template: class.rst ColorJitter + v2.ColorJitter Grayscale + v2.Grayscale RandomGrayscale + v2.RandomGrayscale GaussianBlur + v2.GaussianBlur RandomInvert + v2.RandomInvert RandomPosterize + v2.RandomPosterize RandomSolarize + v2.RandomSolarize RandomAdjustSharpness + v2.RandomAdjustSharpness RandomAutocontrast + v2.RandomAutocontrast RandomEqualize + v2.RandomEqualize Composition ----------- @@ -136,9 +158,13 @@ Composition :template: class.rst Compose + v2.Compose RandomApply + v2.RandomApply RandomChoice + v2.RandomChoice RandomOrder + v2.RandomOrder Miscellaneous ------------- @@ -148,9 +174,13 @@ Miscellaneous :template: class.rst LinearTransformation + v2.LinearTransformation Normalize + v2.Normalize RandomErasing + v2.RandomErasing Lambda + v2.Lambda .. _conversion_transforms: @@ -162,9 +192,15 @@ Conversion :template: class.rst ToPILImage + v2.ToPILImage + v2.ToImagePIL ToTensor + v2.ToTensor PILToTensor + v2.PILToTensor ConvertImageDtype + v2.ConvertImageDtype + v2.ConvertDtype Auto-Augmentation ----------------- @@ -181,9 +217,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran AutoAugmentPolicy AutoAugment + v2.AutoAugment RandAugment + v2.RandAugment TrivialAugmentWide + v2.TrivialAugmentWide AugMix + v2.AugMix .. _functional_transforms: diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 157605d6f3c..405fac15910 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -12,7 +12,38 @@ from .utils import is_simple_tensor, query_chw +# TODO: Just move that to _misc.py? class RandomErasing(_RandomApplyTransform): + """[BETA] Randomly selects a rectangle region in a torch.Tensor image and erases its pixels. + + .. 
betastatus:: RandomErasing transform + + This transform does not support PIL Image. + 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 + + Args: + p: probability that the random erasing operation will be performed. + scale: range of proportion of erased area against input image. + ratio: range of aspect ratio of erased area. + value: erasing value. Default is 0. If a single int, it is used to + erase all pixels. If a tuple of length 3, it is used to erase + R, G, B channels respectively. + If a str of 'random', erasing each pixel with random values. + inplace: boolean to make this transform inplace. Default set to False. + + Returns: + Erased Image. + + Example: + >>> transform = transforms.Compose([ + >>> transforms.RandomHorizontalFlip(), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> transforms.RandomErasing(), + >>> ]) + """ + _v1_transform_cls = _transforms.RandomErasing def _extract_params_for_v1_transform(self) -> Dict[str, Any]: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index b4791755dc5..98e23b99796 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -162,6 +162,24 @@ def _apply_image_or_video_transform( class AutoAugment(_AutoAugmentBase): + r"""[BETA] AutoAugment data augmentation method based on + `"AutoAugment: Learning Augmentation Strategies from Data" `_. + + .. betastatus:: AutoAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + policy (AutoAugmentPolicy): Desired policy enum defined by + :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ _v1_transform_cls = _transforms.AutoAugment _AUGMENTATION_SPACE = { @@ -318,6 +336,27 @@ def forward(self, *inputs: Any) -> Any: class RandAugment(_AutoAugmentBase): + r"""[BETA] RandAugment data augmentation method based on + `"RandAugment: Practical automated data augmentation with a reduced search space" + `_. + + .. betastatus:: RandAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_ops (int): Number of augmentation transformations to apply sequentially. + magnitude (int): Magnitude for all the transformations. + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. 
+ If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandAugment _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -379,6 +418,24 @@ def forward(self, *inputs: Any) -> Any: class TrivialAugmentWide(_AutoAugmentBase): + r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in + `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. + + .. betastatus:: TrivialAugmentWide transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.TrivialAugmentWide _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -430,6 +487,29 @@ def forward(self, *inputs: Any) -> Any: class AugMix(_AutoAugmentBase): + r"""[BETA] AugMix data augmentation method based on + `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. + + .. betastatus:: AugMix transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + severity (int): The severity of base augmentation operators. Default is ``3``. + mixture_width (int): The number of augmentation chains. Default is ``3``. + chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + Default is ``-1``. + alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.AugMix _PARTIAL_AUGMENTATION_SPACE = { diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 64796e16ca4..526c9661991 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,6 +11,23 @@ class Grayscale(Transform): + """[BETA] Convert image to grayscale. + + .. 
betastatus:: Grayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + num_output_channels (int): (1 or 3) number of channels desired for output image + + Returns: + PIL Image: Grayscale version of the input. + + - If ``num_output_channels == 1`` : returned image is single channel + - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b + """ + _v1_transform_cls = _transforms.Grayscale _transformed_types = ( @@ -29,6 +46,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): + """[BETA] Randomly convert image to grayscale with a probability of p (default 0.1). + + .. betastatus:: RandomGrayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + p (float): probability that image should be converted to grayscale. + + Returns: + PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged + with probability (1-p). + - If input image is 1 channel: grayscale version is 1 channel + - If input image is 3 channel: grayscale version is 3 channel with r == g == b + + """ + _v1_transform_cls = _transforms.RandomGrayscale _transformed_types = ( @@ -50,6 +85,32 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image. + + .. betastatus:: ColorJitter transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. + + Args: + brightness (float or tuple of float (min, max)): How much to jitter brightness. + brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + contrast (float or tuple of float (min, max)): How much to jitter contrast. + contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non-negative numbers. + saturation (float or tuple of float (min, max)): How much to jitter saturation. + saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non negative numbers. + hue (float or tuple of float (min, max)): How much to jitter hue. + hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + """ + _v1_transform_cls = _transforms.ColorJitter def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -205,6 +266,18 @@ def _transform( class RandomEqualize(_RandomApplyTransform): + """[BETA] Equalize the histogram of the given image randomly with a given probability. + + .. betastatus:: RandomEqualize transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. 
+ If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". + + Args: + p (float): probability of the image being equalized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomEqualize def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -212,6 +285,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): + """[BETA] Inverts the colors of the given image randomly with a given probability. + + .. betastatus:: RandomInvert transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being color inverted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomInvert def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -219,6 +304,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): + """[BETA] Posterize the image randomly with a given probability by reducing the + number of bits for each color channel. + + .. betastatus:: RandomPosterize transform + + If the image is torch Tensor, it should be of type torch.uint8, + and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + bits (int): number of bits to keep for each channel (0-8) + p (float): probability of the image being posterized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomPosterize def __init__(self, bits: int, p: float = 0.5) -> None: @@ -230,6 +329,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): + """[BETA] Solarize the image randomly with a given probability by inverting all pixel + values above a threshold. + + .. betastatus:: RandomSolarize transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + threshold (float): all pixels equal or above this value are inverted. + p (float): probability of the image being solarized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomSolarize def __init__(self, threshold: float, p: float = 0.5) -> None: @@ -241,6 +354,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): + """[BETA] Autocontrast the pixels of the given image randomly with a given probability. + + .. betastatus:: RandomAutocontrast transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being autocontrasted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAutocontrast def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -248,6 +373,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): + """[BETA] Adjust the sharpness of the image randomly with a given probability. + + .. 
betastatus:: RandomAdjustSharpness transform + + If the image is torch Tensor, + it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + sharpness_factor (float): How much to adjust the sharpness. Can be + any non-negative number. 0 gives a blurred image, 1 gives the + original image while 2 increases the sharpness by a factor of 2. + p (float): probability of the image being sharpened. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAdjustSharpness def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 555010fda1e..66da9c187c0 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -9,6 +9,37 @@ class Compose(Transform): + """[BETA] Composes several transforms together. + + .. betastatus:: Compose transform + + This transform does not support torchscript. + Please, see the note below. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> ]) + + .. note:: + In order to script the transformations, please use ``torch.nn.Sequential`` as below. + + >>> transforms = torch.nn.Sequential( + >>> transforms.CenterCrop(10), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> ) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + """ + def __init__(self, transforms: Sequence[Callable]) -> None: super().__init__() if not isinstance(transforms, Sequence): @@ -29,6 +60,27 @@ def extra_repr(self) -> str: class RandomApply(Transform): + """[BETA] Apply randomly a list of transformations with a given probability. + + .. betastatus:: RandomApply transform + + .. note:: + In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of + transforms as shown below: + + >>> transforms = transforms.RandomApply(torch.nn.ModuleList([ + >>> transforms.ColorJitter(), + >>> ]), p=0.3) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (float): probability + """ + _v1_transform_cls = _transforms.RandomApply def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None: @@ -63,6 +115,12 @@ def extra_repr(self) -> str: class RandomChoice(Transform): + """[BETA] Apply single transformation randomly picked from a list. + + .. betastatus:: RandomChoice transform + + This transform does not support torchscript.""" + def __init__( self, transforms: Sequence[Callable], @@ -99,6 +157,13 @@ def forward(self, *inputs: Any) -> Any: class RandomOrder(Transform): + """[BETA] Apply a list of transformations in a random order. + + .. betastatus:: RandomOrder transform + + This transform does not support torchscript. 
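
    Example (a minimal usage sketch; the particular transforms listed inside are only illustrative):
        >>> from torchvision.transforms import v2 as transforms
        >>> transform = transforms.RandomOrder([
        >>>     transforms.RandomHorizontalFlip(p=0.5),
        >>>     transforms.ColorJitter(brightness=0.2),
        >>> ])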
+ """ + def __init__(self, transforms: Sequence[Callable]) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index bfb0d06239f..29f2017f038 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,6 +10,31 @@ class ToTensor(Transform): + """[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + + .. betastatus:: ToTensor transform + + .. warning:: + v2.ToTensor is deprecated and will be removed in a future release. + Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. + + This transform does not support torchscript. + + + Converts a PIL Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1) + or if the numpy.ndarray has dtype = np.uint8 + + In the other cases, tensors are returned without scaling. + + .. note:: + Because the input image is scaled to [0.0, 1.0], this transformation should not be used when + transforming target image masks. See the `references`_ for implementing the transforms for image masks. + + .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation + """ + _transformed_types = (PIL.Image.Image, np.ndarray) def __init__(self) -> None: diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index f1eed87b9c0..322b317cac5 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,6 +26,18 @@ class RandomHorizontalFlip(_RandomApplyTransform): + """[BETA] Horizontally flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomHorizontalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomHorizontalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -33,6 +45,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): + """[BETA] Vertically flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomVerticalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomVerticalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -40,6 +64,62 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): + """[BETA] Resize the input image/box/mask to the given size. + + .. betastatus:: Resize transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + .. warning:: + The output image might be different depending on its type: when downsampling, the interpolation of PIL images + and tensors is slightly different, because PIL applies antialiasing. 
This may lead to significant differences
        in the performance of a network. Therefore, it is preferable to train and serve a model with the same input
        types. See also below the ``antialias`` parameter, which can help make the output of PIL images and tensors
        closer.

    Args:
        size (sequence or int): Desired output size. If size is a sequence like
            (h, w), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e., if height > width, then image will be rescaled to
            (size * height / width, size).

            .. note::
                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
        interpolation (InterpolationMode): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR``, are accepted as well.
        max_size (int, optional): The maximum allowed for the longer edge of
            the resized image: if the longer edge of the image is greater
            than ``max_size`` after being resized according to ``size``, then
            the image is resized again so that the longer edge is equal to
            ``max_size``. As a result, ``size`` might be overruled, i.e. the
            smaller edge may be shorter than ``size``. This is only supported
            if ``size`` is an int (or a sequence of length 1 in torchscript
            mode).
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True``: will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL does not support disabling antialiasing.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The current default is ``None`` **but will change to** ``True`` **in
            v0.17** for the PIL and Tensor backends to be consistent.
    """

    _v1_transform_cls = _transforms.Resize

    def __init__(
@@ -76,6 +156,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class CenterCrop(Transform):
    """[BETA] Crops the given image/box/mask at the center.

    .. betastatus:: CenterCrop transform

    If the image is torch Tensor, it is expected
    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
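
    Example (a small illustrative sketch; the 480x640 input used here is arbitrary):
        >>> import torch
        >>> from torchvision.transforms import v2 as transforms
        >>> crop = transforms.CenterCrop(224)
        >>> crop(torch.rand(3, 480, 640)).shape
        torch.Size([3, 224, 224])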
+ """ + _v1_transform_cls = _transforms.CenterCrop def __init__(self, size: Union[int, Sequence[int]]): @@ -87,6 +181,53 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): + """[BETA] Crop a random portion of image/box/mask and resize it to a given size. + + .. betastatus:: RandomResizedCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + A crop of the original image is made: the crop has a random area (H * W) + and a random aspect ratio. This crop is finally resized to the given + size. This is popularly used to train the Inception networks. + + Args: + size (int or sequence): expected output size of the crop, for each edge. If size is an + int instead of sequence like (h, w), a square output size ``(size, size)`` is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.RandomResizedCrop def __init__( @@ -164,25 +305,23 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """ - Example: - >>> class BatchMultiCrop(transforms.Transform): - ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): - ... images_or_videos, labels = sample - ... batch_size = len(images_or_videos) - ... image_or_video = images_or_videos[0] - ... images_or_videos = image_or_video.wrap_like(image_or_video, torch.stack(images_or_videos)) - ... labels = torch.full((batch_size,), label, device=images_or_videos.device) - ... return images_or_videos, labels - ... 
- >>> image = datapoints.Image(torch.rand(3, 256, 256)) - >>> label = 3 - >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()]) - >>> images, labels = transform(image, label) - >>> images.shape - torch.Size([5, 3, 224, 224]) - >>> labels - tensor([3, 3, 3, 3, 3]) + """[BETA] Crop the given image/box/mask into four corners and the central crop. + + .. betastatus:: FiveCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an ``int`` + instead of sequence like (h, w), a square crop of size (size, size) is made. + If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). """ _v1_transform_cls = _transforms.FiveCrop @@ -209,8 +348,25 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """ - See :class:`~torchvision.transforms.v2.FiveCrop` for an example. + """[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of + these (horizontal flipping is used by default). + + .. betastatus:: TenCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + vertical_flip (bool): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -249,6 +405,46 @@ def _transform( class Pad(Transform): + """[BETA] Pad the given image/box/mask on all sides with the given "pad" value. + + .. betastatus:: Pad transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, + at most 3 leading dimensions for mode edge, + and an arbitrary number of leading dimensions for mode constant + + Args: + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. 
+ + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.Pad def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -323,6 +519,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): + """[BETA] Rotate the image/box/mask by angle. + + .. betastatus:: RandomRotation transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + expand (bool, optional): Optional expansion flag. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + fill (sequence or number): Pixel fill value for the area outside the rotated + image. Default is ``0``. If given a number, the value is used for all bands respectively. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomRotation def __init__( @@ -363,6 +587,42 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAffine(Transform): + """[BETA] Random affine transformation of the image/box/mask keeping center invariant. + + .. betastatus:: RandomAffine transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). Set to 0 to deactivate rotations. + translate (tuple, optional): tuple of maximum absolute fraction for horizontal + and vertical translations. For example translate=(a, b), then horizontal shift + is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is + randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. + scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is + randomly sampled from the range a <= scale <= b. 
Will keep original scale by default. + shear (sequence or number, optional): Range of degrees to select from. + If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear) + will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the + range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, + an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Will not apply shear by default. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomAffine def __init__( @@ -443,6 +703,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomCrop(Transform): + """[BETA] Crop the given image/box/mask at a random location. + + .. betastatus:: RandomCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions, + but if non-constant padding is used, the input is expected to have at most 2 leading dimensions + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. 
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.RandomCrop def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -552,6 +858,25 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): + """[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability. + + .. betastatus:: RandomPerspective transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1. + Default is 0.5. + p (float): probability of the image being transformed. Default is 0.5. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandomPerspective def __init__( diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 0d1544094ca..7d0f0ec39f9 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -22,6 +22,27 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da class ConvertDtype(Transform): + """[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly + + .. betastatus:: ConvertDtype transform + + This function does not support PIL Image. + + Args: + dtype (torch.dtype): Desired data type of the output + + .. note:: + + When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly. + If converted back and forth, this mismatch has no effect. + + Raises: + RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as + well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to + overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range + of the integer ``dtype``. + """ + _v1_transform_cls = _transforms.ConvertImageDtype _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 6dd0755cfbb..4c260f97b13 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -21,6 +21,16 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Lambda(Transform): + """[BETA] Apply a user-defined lambda as a transform. + + .. betastatus:: Lambda transform + + This transform does not support torchscript. + + Args: + lambd (function): Lambda/function to be used for transform. 
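
    Example (a minimal sketch; the clamping function is just an arbitrary callable):
        >>> import torch
        >>> from torchvision.transforms import v2 as transforms
        >>> transform = transforms.Lambda(lambda x: x.clamp(0, 1))
        >>> out = transform(torch.rand(3, 8, 8))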
+    """

    def __init__(self, lambd: Callable[[Any], Any], *types: Type):
        super().__init__()
        self.lambd = lambd
@@ -42,6 +52,26 @@ def extra_repr(self) -> str:


 class LinearTransformation(Transform):
    """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline.

    .. betastatus:: LinearTransformation transform

    This transform does not support PIL Image.
    Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
    subtract mean_vector from it, which is then followed by computing the dot
    product with the transformation matrix and then reshaping the tensor to its
    original shape.

    Applications:
        whitening transformation: Suppose X is a column vector of zero-centered data.
        Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
        perform SVD on this matrix and pass it as transformation_matrix.

    Args:
        transformation_matrix (Tensor): tensor [D x D], D = C x H x W
        mean_vector (Tensor): tensor [D], D = C x H x W
    """

    _v1_transform_cls = _transforms.LinearTransformation

    _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
@@ -105,6 +135,26 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class Normalize(Transform):
    """[BETA] Normalize a tensor image with mean and standard deviation.

    .. betastatus:: Normalize transform

    This transform does not support PIL Image.
    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
    channels, this transform will normalize each channel of the input
    ``torch.*Tensor``, i.e.,
    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``

    .. note::
        This transform acts out of place, i.e., it does not mutate the input tensor.

    Args:
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
        inplace (bool, optional): Bool to make this operation in-place.

    """

    _v1_transform_cls = _transforms.Normalize

    _transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video)
@@ -124,7 +174,26 @@ def _transform(
        return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace)


+# TODO: This should be in the _color.py file!
 class GaussianBlur(Transform):
    """[BETA] Blurs image with randomly chosen Gaussian blur.

    .. betastatus:: GaussianBlur transform

    If the image is torch Tensor, it is expected
    to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.

    Args:
        kernel_size (int or sequence): Size of the Gaussian kernel.
        sigma (float or tuple of float (min, max)): Standard deviation to be used for
            creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
            of float (min, max), sigma is chosen uniformly at random to lie in the
            given range.

    Returns:
        PIL Image or Tensor: Gaussian blurred version of the input image.
    """

    _v1_transform_cls = _transforms.GaussianBlur

    def __init__(
diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py
index 984d5ba50c0..b0743feb10d 100644
--- a/torchvision/transforms/v2/_type_conversion.py
+++ b/torchvision/transforms/v2/_type_conversion.py
@@ -11,6 +11,15 @@


 class PILToTensor(Transform):
    """[BETA] Convert a ``PIL Image`` to a tensor of the same type.

    .. betastatus:: PILToTensor transform

    This transform does not support torchscript.

    Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W).
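
    Example (illustrative; a blank 640x480 RGB image is used only to show the resulting shape):
        >>> import PIL.Image
        >>> from torchvision.transforms import v2 as transforms
        >>> pil_img = PIL.Image.new("RGB", (640, 480))
        >>> transforms.PILToTensor()(pil_img).shape
        torch.Size([3, 480, 640])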
+ """ + _transformed_types = (PIL.Image.Image,) def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor: @@ -27,6 +36,27 @@ def _transform( class ToImagePIL(Transform): + """[BETA] Convert a tensor or an ndarray to PIL Image. + + .. betastatus:: ToImagePIL transform + + This transform does not support torchscript. + + Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape + H x W x C to a PIL Image while preserving the value range. + + Args: + mode (`PIL.Image mode`_): color space and pixel depth of input data (optional). + If ``mode`` is ``None`` (default) there are some assumptions made about the input data: + - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``. + - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``. + - If the input has 2 channels, the ``mode`` is assumed to be ``LA``. + - If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``, + ``short``). + + .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes + """ + _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray) def __init__(self, mode: Optional[str] = None) -> None: From 11d295cc9609718173806c0db647f279c034f531 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 16:31:41 +0000 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: vfdev Co-authored-by: Philip Meier --- torchvision/transforms/v2/_augment.py | 6 ++++-- torchvision/transforms/v2/_color.py | 2 +- torchvision/transforms/v2/_deprecated.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 405fac15910..4473b23de6f 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -14,7 +14,7 @@ # TODO: Just move that to _misc.py? class RandomErasing(_RandomApplyTransform): - """[BETA] Randomly selects a rectangle region in a torch.Tensor image and erases its pixels. + """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels. .. betastatus:: RandomErasing transform @@ -32,9 +32,11 @@ class RandomErasing(_RandomApplyTransform): inplace: boolean to make this transform inplace. Default set to False. Returns: - Erased Image. + Erased input. Example: + >>> from torchvision.transforms import v2 as transforms + >>> >>> transform = transforms.Compose([ >>> transforms.RandomHorizontalFlip(), >>> transforms.PILToTensor(), diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 526c9661991..785a3965e60 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,7 +11,7 @@ class Grayscale(Transform): - """[BETA] Convert image to grayscale. + """[BETA] Convert images or videos to grayscale. .. betastatus:: Grayscale transform diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index 29f2017f038..c44e6b08d11 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -15,7 +15,7 @@ class ToTensor(Transform): .. betastatus:: ToTensor transform .. warning:: - v2.ToTensor is deprecated and will be removed in a future release. + :class:`v2.ToTensor` is deprecated and will be removed in a future release. Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. 
This transform does not support torchscript. From 6a414fecfa4eace10942e4ec78724f49c7f30d67 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 16:34:36 +0000 Subject: [PATCH 3/3] Address comments --- torchvision/transforms/v2/_augment.py | 1 - torchvision/transforms/v2/_geometry.py | 23 ++++++++++++++++++++++- torchvision/transforms/v2/_misc.py | 1 - 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 4473b23de6f..b5aac9ca9a2 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -12,7 +12,6 @@ from .utils import is_simple_tensor, query_chw -# TODO: Just move that to _misc.py? class RandomErasing(_RandomApplyTransform): """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels. diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index 322b317cac5..af8ca4b6471 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -322,6 +322,25 @@ class FiveCrop(Transform): size (sequence or int): Desired output size of the crop. If size is an ``int`` instead of sequence like (h, w), a square crop of size (size, size) is made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + Example: + >>> class BatchMultiCrop(transforms.Transform): + ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): + ... images_or_videos, labels = sample + ... batch_size = len(images_or_videos) + ... image_or_video = images_or_videos[0] + ... images_or_videos = image_or_video.wrap_like(image_or_video, torch.stack(images_or_videos)) + ... labels = torch.full((batch_size,), label, device=images_or_videos.device) + ... return images_or_videos, labels + ... + >>> image = datapoints.Image(torch.rand(3, 256, 256)) + >>> label = 3 + >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()]) + >>> images, labels = transform(image, label) + >>> images.shape + torch.Size([5, 3, 224, 224]) + >>> labels + tensor([3, 3, 3, 3, 3]) """ _v1_transform_cls = _transforms.FiveCrop @@ -355,7 +374,9 @@ class TenCrop(Transform): If the image is torch Tensor, it is expected to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + dimensions. + + See :class:`~torchvision.transforms.v2.FiveCrop` for an example. .. Note:: This transform returns a tuple of images and there may be a mismatch in the number of diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 4c260f97b13..6998d416c91 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -174,7 +174,6 @@ def _transform( return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) -# TODO: This should be in the _color.py file! class GaussianBlur(Transform): """[BETA] Blurs image with randomly chosen Gaussian blur.