From e1044f90d7abe90c3ccd6ca9ea0d323fd51ac4cd Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Thu, 23 Feb 2023 16:44:55 +0000
Subject: [PATCH 1/6] Drive-by changes

---
 torchvision/transforms/v2/_color.py | 33 ++++++++++-------------------
 torchvision/transforms/v2/_misc.py  | 12 +++++------
 2 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py
index 2a581bf5640..98ee9eb5490 100644
--- a/torchvision/transforms/v2/_color.py
+++ b/torchvision/transforms/v2/_color.py
@@ -15,17 +15,11 @@ class Grayscale(Transform):
 
     .. betastatus:: Grayscale transform
 
-    If the image is torch Tensor, it is expected
-    to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
+    If the input is a Tensor, it is expected
+    to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions
 
     Args:
         num_output_channels (int): (1 or 3) number of channels desired for output image
-
-    Returns:
-        PIL Image: Grayscale version of the input.
-
-        - If ``num_output_channels == 1`` : returned image is single channel
-        - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b
     """
 
     _v1_transform_cls = _transforms.Grayscale
@@ -50,18 +44,13 @@ class RandomGrayscale(_RandomApplyTransform):
 
     .. betastatus:: RandomGrayscale transform
 
-    If the image is torch Tensor, it is expected
-    to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions
+    If the input is a Tensor, it is expected to have [..., 3 or 1, H, W] shape,
+    where ... means an arbitrary number of leading dimensions
+
+    The output has the same number of channels as the input.
 
     Args:
         p (float): probability that image should be converted to grayscale.
-
-    Returns:
-        PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged
-        with probability (1-p).
-        - If input image is 1 channel: grayscale version is 1 channel
-        - If input image is 3 channel: grayscale version is 3 channel with r == g == b
-
     """
 
     _v1_transform_cls = _transforms.RandomGrayscale
@@ -89,7 +78,7 @@ class ColorJitter(Transform):
 
     .. betastatus:: ColorJitter transform
 
-    If the image is torch Tensor, it is expected
+    If the input is a Tensor, it is expected
     to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
 
@@ -295,7 +284,7 @@ class RandomEqualize(_RandomApplyTransform):
 
     .. betastatus:: RandomEqualize transform
 
-    If the image is torch Tensor, it is expected
+    If the input is a Tensor, it is expected
     to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, it is expected to be in mode "P", "L" or "RGB".
 
@@ -334,7 +323,7 @@ class RandomPosterize(_RandomApplyTransform):
 
     .. betastatus:: RandomPosterize transform
 
-    If the image is torch Tensor, it should be of type torch.uint8,
+    If the input is a Tensor, it should be of type torch.uint8,
     and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, it is expected to be in mode "L" or "RGB".
 
@@ -383,7 +372,7 @@ class RandomAutocontrast(_RandomApplyTransform):
 
     .. betastatus:: RandomAutocontrast transform
 
-    If the image is torch Tensor, it is expected
+    If the input is a Tensor, it is expected
     to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, it is expected to be in mode "L" or "RGB".
 
@@ -402,7 +391,7 @@ class RandomAdjustSharpness(_RandomApplyTransform):
 
     .. betastatus:: RandomAdjustSharpness transform
 
-    If the image is torch Tensor,
+    If the input is a Tensor,
     it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
 
     Args:
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 8cc4aa6a3db..482b8cd354b 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -15,13 +15,14 @@
 from .utils import has_any, is_simple_tensor, query_bounding_box
 
 
+# TODO: do we want/need to expose this?
 class Identity(Transform):
     def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
         return inpt
 
 
 class Lambda(Transform):
-    """[BETA] Apply a user-defined lambda as a transform.
+    """[BETA] Apply a user-defined function as a transform.
 
     .. betastatus:: Lambda transform
 
@@ -52,7 +53,7 @@ def extra_repr(self) -> str:
 
 
 class LinearTransformation(Transform):
-    """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline.
+    """[BETA] Transform a tensor image or video with a square transformation matrix and a mean_vector computed offline.
 
     .. betastatus:: LinearTransformation transform
 
@@ -135,7 +136,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 
 
 class Normalize(Transform):
-    """[BETA] Normalize a tensor image with mean and standard deviation.
+    """[BETA] Normalize a tensor image or video with mean and standard deviation.
 
     .. betastatus:: Normalize transform
 
@@ -179,7 +180,7 @@ class GaussianBlur(Transform):
 
     .. betastatus:: GausssianBlur transform
 
-    If the image is torch Tensor, it is expected
+    If the input is a Tensor, it is expected
     to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.
 
     Args:
@@ -188,9 +189,6 @@ class GaussianBlur(Transform):
             creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
             of float (min, max), sigma is chosen uniformly at random to lie in the
             given range.
-
-    Returns:
-        PIL Image or Tensor: Gaussian blurred version of the input image.
     """
 
     _v1_transform_cls = _transforms.GaussianBlur

From cca2557e20de156e24bc0811da51f4c58681c93a Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Thu, 23 Feb 2023 17:08:01 +0000
Subject: [PATCH 2/6] Misc docs

---
 docs/source/transforms.rst         |  2 ++
 torchvision/transforms/v2/_misc.py | 37 ++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index c2e9855d9e8..0ab9b066dad 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -182,6 +182,7 @@ Miscellaneous
     v2.RandomErasing
     Lambda
     v2.Lambda
+    v2.SanitizeBoundingBoxes
 
 .. _conversion_transforms:
 
@@ -202,6 +203,7 @@ Conversion
     ConvertImageDtype
     v2.ConvertImageDtype
     v2.ConvertDtype
+    v2.ToDtype
 
 Auto-Augmentation
 -----------------
diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index 482b8cd354b..c95394004ef 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -223,6 +223,15 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 
 
 class ToDtype(Transform):
+    """[BETA] Converts the input to a specific dtype.
+
+    .. betastatus:: ToDtype transform
+
+    Args:
+        dtype (dtype or dict of Datapoint -> dtype): The dtype to convert to. A dict can be passed to specify
+            per-datapoint conversions, e.g. ``dtype={datapoints.Image: torch.float32, datapoints.Video: torch.float64}``.
+    """
+
     _transformed_types = (torch.Tensor,)
 
     def __init__(self, dtype: Union[torch.dtype, Dict[Type, Optional[torch.dtype]]]) -> None:
@@ -245,6 +254,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 
 
 class SanitizeBoundingBoxes(Transform):
+    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels.
+
+    .. betastatus:: SanitizeBoundingBoxes transform
+
+    This transform removes bounding boxes and their associated labels that:
+
+    - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
+    - have any coordinate outside of their corresponding image. You may want to
+      call :class:`~torchvision.transforms.v2.ClampBoundingBox` first to avoid undesired removals.
+
+    It is recommended to call it at the end of a pipeline, before passing the
+    input to the models. It is critical to call this transform if
+    :class:`~torchvision.transforms.v2.RandomIoUCrop` was called.
+    If you want to be extra careful, you may call it after all transforms that
+    may modify bounding boxes but once at the end should be enough in most
+    cases.
+
+    Args:
+        min_size (float, optional): The size below which bounding boxes are removed. Default is 1.
+        labels_getter (callable or str or None, optional): indicates how to identify the labels in the input.
+            It can be a str in which case the input is expected to be a dict, and ``labels_getter`` then specifies
+            the key whose value corresponds to the labels. It can also be a callable that takes the same input
+            as the transform, and returns the labels.
+            By default, this will try to find a "labels" key in the input, if
+            the input is a dict or it is a tuple whose second element is a dict.
+            This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
+    """
+
     # This removes boxes and their corresponding labels:
     # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
     # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)

From f20810f1287192b05e12b495525a6755044cb7ce Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Fri, 24 Feb 2023 09:37:55 +0000
Subject: [PATCH 3/6] Update torchvision/transforms/v2/_misc.py

Co-authored-by: Philip Meier <github.pmeier@posteo.de>
---
 torchvision/transforms/v2/_misc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index c95394004ef..f3591c09bea 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -254,7 +254,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 
 
 class SanitizeBoundingBoxes(Transform):
-    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels.
+    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and optionally masks.
 
     .. betastatus:: SanitizeBoundingBoxes transform
 

From d03b1911d724a79ec48c3f0d44f0779f3d692a50 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Fri, 24 Feb 2023 09:38:03 +0000
Subject: [PATCH 4/6] Update torchvision/transforms/v2/_color.py

Co-authored-by: vfdev <vfdev.5@gmail.com>
---
 torchvision/transforms/v2/_color.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py
index 98ee9eb5490..757949f3e33 100644
--- a/torchvision/transforms/v2/_color.py
+++ b/torchvision/transforms/v2/_color.py
@@ -15,7 +15,7 @@ class Grayscale(Transform):
 
     .. betastatus:: Grayscale transform
 
-    If the input is a Tensor, it is expected
+    If the input is a :class:`torch.Tensor`, it is expected
     to have [..., 3 or 1, H, W] shape, where ... means an arbitrary number of leading dimensions
 
     Args:

From 04c90abc52a2bfdbbc6a8d0b3fef0ddbc0457d18 Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Fri, 24 Feb 2023 09:41:30 +0000
Subject: [PATCH 5/6] Mention masks

---
 torchvision/transforms/v2/_misc.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py
index c8cbe81807f..2237334f7a2 100644
--- a/torchvision/transforms/v2/_misc.py
+++ b/torchvision/transforms/v2/_misc.py
@@ -254,11 +254,11 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
 
 
 class SanitizeBoundingBox(Transform):
-    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and optionally masks.
+    """[BETA] Remove degenerate/invalid bounding boxes and their corresponding labels and masks.
 
     .. betastatus:: SanitizeBoundingBox transform
 
-    This transform removes bounding boxes and their associated labels that:
+    This transform removes bounding boxes and their associated labels/masks that:
 
     - are below a given ``min_size``: by default this also removes degenerate boxes that have e.g. X2 <= X1.
     - have any coordinate outside of their corresponding image. You may want to
@@ -282,10 +282,6 @@ class SanitizeBoundingBox(Transform):
             This heuristic should work well with a lot of datasets, including the built-in torchvision datasets.
     """
 
-    # This removes boxes and their corresponding labels:
-    # - small or degenerate bboxes based on min_size (this includes those where X2 <= X1 or Y2 <= Y1)
-    # - boxes with any coordinate outside the range of the image (negative, or > spatial_size)
-
     def __init__(
         self,
         min_size: float = 1.0,

From 32d3d563edcd859cb89c59828785eaa0f8d45b8c Mon Sep 17 00:00:00 2001
From: Nicolas Hug <contact@nicolas-hug.com>
Date: Fri, 24 Feb 2023 09:43:43 +0000
Subject: [PATCH 6/6] Tensor to :class:

---
 torchvision/transforms/v2/_color.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py
index 757949f3e33..237e8d6181a 100644
--- a/torchvision/transforms/v2/_color.py
+++ b/torchvision/transforms/v2/_color.py
@@ -44,7 +44,7 @@ class RandomGrayscale(_RandomApplyTransform):
 
     .. betastatus:: RandomGrayscale transform
 
-    If the input is a Tensor, it is expected to have [..., 3 or 1, H, W] shape,
+    If the input is a :class:`torch.Tensor`, it is expected to have [..., 3 or 1, H, W] shape,
     where ... means an arbitrary number of leading dimensions
 
     The output has the same number of channels as the input.
@@ -78,7 +78,7 @@ class ColorJitter(Transform):
 
     .. betastatus:: ColorJitter transform
 
-    If the input is a Tensor, it is expected
+    If the input is a :class:`torch.Tensor`, it is expected
     to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported.
 
@@ -284,7 +284,7 @@ class RandomEqualize(_RandomApplyTransform):
 
     .. betastatus:: RandomEqualize transform
 
-    If the input is a Tensor, it is expected
+    If the input is a :class:`torch.Tensor`, it is expected
     to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, it is expected to be in mode "P", "L" or "RGB".
 
@@ -323,7 +323,7 @@ class RandomPosterize(_RandomApplyTransform):
 
     .. betastatus:: RandomPosterize transform
 
-    If the input is a Tensor, it should be of type torch.uint8,
+    If the input is a :class:`torch.Tensor`, it should be of type torch.uint8,
     and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, it is expected to be in mode "L" or "RGB".
 
@@ -372,7 +372,7 @@ class RandomAutocontrast(_RandomApplyTransform):
 
     .. betastatus:: RandomAutocontrast transform
 
-    If the input is a Tensor, it is expected
+    If the input is a :class:`torch.Tensor`, it is expected
     to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
     If img is PIL Image, it is expected to be in mode "L" or "RGB".
 
@@ -391,7 +391,7 @@ class RandomAdjustSharpness(_RandomApplyTransform):
 
     .. betastatus:: RandomAdjustSharpness transform
 
-    If the input is a Tensor,
+    If the input is a :class:`torch.Tensor`,
     it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions.
 
     Args: