From e1044f90d7abe90c3ccd6ca9ea0d323fd51ac4cd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 16:44:55 +0000 Subject: [PATCH 1/6] Drive-by changes --- torchvision/transforms/v2/_color.py | 33 ++++++++++------------------- torchvision/transforms/v2/_misc.py | 12 +++++------ 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 2a581bf5640..98ee9eb5490 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -15,17 +15,11 @@ class Grayscale(Transform): .. betastatus:: Grayscale transform - If the image is torch Tensor, it is expected - to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a Tensor, it is expected + to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions Args: num_output_channels (int): (1 or 3) number of channels desired for output image - - Returns: - PIL Image: Grayscale version of the input. - - - If ``num_output_channels == 1`` : returned image is single channel - - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b """ _v1_transform_cls = _transforms.Grayscale @@ -50,18 +44,13 @@ class RandomGrayscale(_RandomApplyTransform): .. betastatus:: RandomGrayscale transform - If the image is torch Tensor, it is expected - to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + If the input is a Tensor, it is expected to have [..., 3 or 1, H, W] shape, + where ... means an arbitrary number of leading dimensions + + The output has the same number of channels as the input. Args: p (float): probability that image should be converted to grayscale. - - Returns: - PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged - with probability (1-p). 
- - If input image is 1 channel: grayscale version is 1 channel - - If input image is 3 channel: grayscale version is 3 channel with r == g == b - """ _v1_transform_cls = _transforms.RandomGrayscale @@ -89,7 +78,7 @@ class ColorJitter(Transform): .. betastatus:: ColorJitter transform - If the image is torch Tensor, it is expected + If the input is a Tensor, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. @@ -295,7 +284,7 @@ class RandomEqualize(_RandomApplyTransform): .. betastatus:: RandomEqualize transform - If the image is torch Tensor, it is expected + If the input is a Tensor, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". @@ -334,7 +323,7 @@ class RandomPosterize(_RandomApplyTransform): .. betastatus:: RandomPosterize transform - If the image is torch Tensor, it should be of type torch.uint8, + If the input is a Tensor, it should be of type torch.uint8, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". @@ -383,7 +372,7 @@ class RandomAutocontrast(_RandomApplyTransform): .. betastatus:: RandomAutocontrast transform - If the image is torch Tensor, it is expected + If the input is a Tensor, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". @@ -402,7 +391,7 @@ class RandomAdjustSharpness(_RandomApplyTransform): .. betastatus:: RandomAdjustSharpness transform - If the image is torch Tensor, + If the input is a Tensor, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. 
Args: diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 8cc4aa6a3db..482b8cd354b 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -15,13 +15,14 @@ from .utils import has_any, is_simple_tensor, query_bounding_box +# TODO: do we want/need to expose this? class Identity(Transform): def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return inpt class Lambda(Transform): - """[BETA] Apply a user-defined lambda as a transform. + """[BETA] Apply a user-defined function as a transform. .. betastatus:: Lambda transform @@ -52,7 +53,7 @@ def extra_repr(self) -> str: class LinearTransformation(Transform): - """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline. + """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline. .. betastatus:: LinearTransformation transform @@ -135,7 +136,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Normalize(Transform): - """[BETA] Normalize a tensor image with mean and standard deviation. + """[BETA] Normalize a tensor image or video with mean and standard deviation. .. betastatus:: Normalize transform @@ -179,7 +180,7 @@ class GaussianBlur(Transform): .. betastatus:: GausssianBlur transform - If the image is torch Tensor, it is expected + If the input is a Tensor, it is expected to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions. Args: @@ -188,9 +189,6 @@ class GaussianBlur(Transform): creating kernel to perform blurring. If float, sigma is fixed. If it is tuple of float (min, max), sigma is chosen uniformly at random to lie in the given range. - - Returns: - PIL Image or Tensor: Gaussian blurred version of the input image. 
""" _v1_transform_cls = _transforms.GaussianBlur From cca2557e20de156e24bc0811da51f4c58681c93a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 23 Feb 2023 17:08:01 +0000 Subject: [PATCH 2/6] Misc docs --- docs/source/transforms.rst | 2 ++ torchvision/transforms/v2/_misc.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index c2e9855d9e8..0ab9b066dad 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -182,6 +182,7 @@ Miscellaneous v2.RandomErasing Lambda v2.Lambda + v2.SanitizeBoundingBoxes .. _conversion_transforms: @@ -202,6 +203,7 @@ Conversion ConvertImageDtype v2.ConvertImageDtype v2.ConvertDtype + v2.ToDtype Auto-Augmentation ----------------- diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 482b8cd354b..c95394004ef 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -223,6 +223,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ToDtype(Transform): + """[BETA] Converts the input to a specific dtype. + + .. betastatus:: ToDtype transform + + Args: + dtype (dtype or dict of Datapoint -> dtype): The dtype to convert to. A dict can be passed to specify + per-datapoint conversions, e.g. ``dtype={datapoints.Image: torch.float32, datapoints.Video: torch.float64}``. + """ + _transformed_types = (torch.Tensor,) def __init__(self, dtype: Union[torch.dtype, Dict[Type, Optional[torch.dtype]]]) -> None: @@ -245,6 +254,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class SanitizeBoundingBoxes(Transform): + """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels. + + .. betastatus:: SanitizeBoundingBoxes transform + + This transform removes bounding boxes and their associated labels that: + + - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1. 
+ - have any coordinate outside of their corresponding image. You may want to + call :class:`~torchvision.transforms.v2.ClampBoundingBox` first to avoid undesired removals. + + It is recommended to call it at the end of a pipeline, before passing the + input to the models. It is critical to call this transform if + :class:`~torchvision.transforms.v2.RandomIoUCrop` was called. + If you want to be extra careful, you may call it after all transforms that + may modify bounding boxes but once at the end should be enough in most + cases. + + Args: + min_size (float, optional): The size below which bounding boxes are removed. Default is 1. + labels_getter (callable or str or None, optional): indicates how to identify the labels in the input. + It can be a str in which case the input is expected to be a dict, and ``labels_getter`` then specifies + the key whose value corresponds to the labels. It can also be a callable that takes the same input + as the transform, and returns the labels. + By default, this will try to find a "labels" key in the input, if + the input is a dict or it is a tuple whose second element is a dict. + This heuristic should work well with a lot of datasets, including the built-in torchvision datasets. 
+ """ + # This removes boxes and their corresponding labels: # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1) # - boxes with any coordinate outside the range of the image (negative, or > spatial_size) From f20810f1287192b05e12b495525a6755044cb7ce Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 09:37:55 +0000 Subject: [PATCH 3/6] Update torchvision/transforms/v2/_misc.py Co-authored-by: Philip Meier --- torchvision/transforms/v2/_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index c95394004ef..f3591c09bea 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -254,7 +254,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class SanitizeBoundingBoxes(Transform): - """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels. + """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and optionally masks. .. betastatus:: SanitizeBoundingBoxes transform From d03b1911d724a79ec48c3f0d44f0779f3d692a50 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 09:38:03 +0000 Subject: [PATCH 4/6] Update torchvision/transforms/v2/_color.py Co-authored-by: vfdev --- torchvision/transforms/v2/_color.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 98ee9eb5490..757949f3e33 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -15,7 +15,7 @@ class Grayscale(Transform): .. betastatus:: Grayscale transform - If the input is a Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, where ... 
means an arbitrary number of leading dimensions Args: From 04c90abc52a2bfdbbc6a8d0b3fef0ddbc0457d18 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 09:41:30 +0000 Subject: [PATCH 5/6] Mention masks --- torchvision/transforms/v2/_misc.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index c8cbe81807f..2237334f7a2 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -254,11 +254,11 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class SanitizeBoundingBox(Transform): - """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and optionally masks. + """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks. .. betastatus:: SanitizeBoundingBox transform - This transform removes bounding boxes and their associated labels that: + This transform removes bounding boxes and their associated labels/masks that: - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1. - have any coordinate outside of their corresponding image. You may want to @@ -282,10 +282,6 @@ class SanitizeBoundingBox(Transform): This heuristic should work well with a lot of datasets, including the built-in torchvision datasets. 
""" - This removes boxes and their corresponding labels: - # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1) - # - boxes with any coordinate outside the range of the image (negative, or > spatial_size) - def __init__( self, min_size: float = 1.0, From 32d3d563edcd859cb89c59828785eaa0f8d45b8c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 24 Feb 2023 09:43:43 +0000 Subject: [PATCH 6/6] Tensor to :class: --- torchvision/transforms/v2/_color.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 757949f3e33..237e8d6181a 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -44,7 +44,7 @@ class RandomGrayscale(_RandomApplyTransform): .. betastatus:: RandomGrayscale transform - If the input is a Tensor, it is expected to have [..., 3 or 1, H, W] shape, + If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions The output has the same number of channels as the input. @@ -78,7 +78,7 @@ class ColorJitter(Transform): .. betastatus:: ColorJitter transform - If the input is a Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. @@ -284,7 +284,7 @@ class RandomEqualize(_RandomApplyTransform): .. betastatus:: RandomEqualize transform - If the input is a Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". @@ -323,7 +323,7 @@ class RandomPosterize(_RandomApplyTransform): .. 
betastatus:: RandomPosterize transform - If the input is a Tensor, it should be of type torch.uint8, + If the input is a :class:`torch.Tensor`, it should be of type torch.uint8, and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". @@ -372,7 +372,7 @@ class RandomAutocontrast(_RandomApplyTransform): .. betastatus:: RandomAutocontrast transform - If the input is a Tensor, it is expected + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. If img is PIL Image, it is expected to be in mode "L" or "RGB". @@ -391,7 +391,7 @@ class RandomAdjustSharpness(_RandomApplyTransform): .. betastatus:: RandomAdjustSharpness transform - If the input is a Tensor, + If the input is a :class:`torch.Tensor`, it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. Args: