From 339f538394372e854aa19a1bdfc2de2fb4006ea3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 15:14:13 +0000 Subject: [PATCH 1/3] Added docs for v2 transforms (part 1) --- docs/source/conf.py | 2 + docs/source/transforms.rst | 40 ++ torchvision/transforms/v2/_augment.py | 31 ++ torchvision/transforms/v2/_auto_augment.py | 80 ++++ torchvision/transforms/v2/_color.py | 139 +++++++ torchvision/transforms/v2/_container.py | 65 ++++ torchvision/transforms/v2/_deprecated.py | 25 ++ torchvision/transforms/v2/_geometry.py | 367 +++++++++++++++++- torchvision/transforms/v2/_meta.py | 21 + torchvision/transforms/v2/_misc.py | 69 ++++ torchvision/transforms/v2/_type_conversion.py | 30 ++ 11 files changed, 848 insertions(+), 21 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 72c83d7893d..304a1cc6e22 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -33,6 +33,8 @@ sys.path.append(os.path.abspath(".")) +torchvision.disable_beta_transforms_warning() + # -- General configuration ------------------------------------------------ # Required version of sphinx is set from docs/requirements.txt diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index d831b81e37f..00d929d0675 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -98,17 +98,29 @@ Geometry :template: class.rst Resize + v2.Resize RandomCrop + v2.RandomCrop RandomResizedCrop + v2.RandomResizedCrop CenterCrop + v2.CenterCrop FiveCrop + v2.FiveCrop TenCrop + v2.TenCrop Pad + v2.Pad RandomAffine + v2.RandomAffine RandomPerspective + v2.RandomPerspective RandomRotation + v2.RandomRotation RandomHorizontalFlip + v2.RandomHorizontalFlip RandomVerticalFlip + v2.RandomVerticalFlip Color ----- @@ -118,15 +130,25 @@ Color :template: class.rst ColorJitter + v2.ColorJitter Grayscale + v2.Grayscale RandomGrayscale + v2.RandomGrayscale GaussianBlur + v2.GaussianBlur RandomInvert + v2.RandomInvert RandomPosterize + v2.RandomPosterize RandomSolarize + v2.RandomSolarize RandomAdjustSharpness + v2.RandomAdjustSharpness RandomAutocontrast + v2.RandomAutocontrast RandomEqualize + v2.RandomEqualize Composition ----------- @@ -136,9 +158,13 @@ Composition :template: class.rst Compose + v2.Compose RandomApply + v2.RandomApply RandomChoice + v2.RandomChoice RandomOrder + v2.RandomOrder Miscellaneous ------------- @@ -148,9 +174,13 @@ Miscellaneous :template: class.rst LinearTransformation + v2.LinearTransformation Normalize + v2.Normalize RandomErasing + v2.RandomErasing Lambda + v2.Lambda .. _conversion_transforms: @@ -162,9 +192,15 @@ Conversion :template: class.rst ToPILImage + v2.ToPILImage + v2.ToImagePIL ToTensor + v2.ToTensor PILToTensor + v2.PILToTensor ConvertImageDtype + v2.ConvertImageDtype + v2.ConvertDtype Auto-Augmentation ----------------- @@ -181,9 +217,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran AutoAugmentPolicy AutoAugment + v2.AutoAugment RandAugment + v2.RandAugment TrivialAugmentWide + v2.TrivialAugmentWide AugMix + v2.AugMix .. _functional_transforms: diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 157605d6f3c..405fac15910 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -12,7 +12,38 @@ from .utils import is_simple_tensor, query_chw +# TODO: Just move that to _misc.py? class RandomErasing(_RandomApplyTransform): + """[BETA] Randomly selects a rectangle region in a torch.Tensor image and erases its pixels. + + .. 
betastatus:: RandomErasing transform + + This transform does not support PIL Image. + 'Random Erasing Data Augmentation' by Zhong et al. See https://arxiv.org/abs/1708.04896 + + Args: + p: probability that the random erasing operation will be performed. + scale: range of proportion of erased area against input image. + ratio: range of aspect ratio of erased area. + value: erasing value. Default is 0. If a single int, it is used to + erase all pixels. If a tuple of length 3, it is used to erase + R, G, B channels respectively. + If a str of 'random', erasing each pixel with random values. + inplace: boolean to make this transform inplace. Default set to False. + + Returns: + Erased Image. + + Example: + >>> transform = transforms.Compose([ + >>> transforms.RandomHorizontalFlip(), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> transforms.RandomErasing(), + >>> ]) + """ + _v1_transform_cls = _transforms.RandomErasing def _extract_params_for_v1_transform(self) -> Dict[str, Any]: diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index b4791755dc5..98e23b99796 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -162,6 +162,24 @@ def _apply_image_or_video_transform( class AutoAugment(_AutoAugmentBase): + r"""[BETA] AutoAugment data augmentation method based on + `"AutoAugment: Learning Augmentation Strategies from Data" `_. + + .. betastatus:: AutoAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + policy (AutoAugmentPolicy): Desired policy enum defined by + :class:`torchvision.transforms.autoaugment.AutoAugmentPolicy`. Default is ``AutoAugmentPolicy.IMAGENET``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ _v1_transform_cls = _transforms.AutoAugment _AUGMENTATION_SPACE = { @@ -318,6 +336,27 @@ def forward(self, *inputs: Any) -> Any: class RandAugment(_AutoAugmentBase): + r"""[BETA] RandAugment data augmentation method based on + `"RandAugment: Practical automated data augmentation with a reduced search space" + `_. + + .. betastatus:: RandAugment transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_ops (int): Number of augmentation transformations to apply sequentially. + magnitude (int): Magnitude for all the transformations. + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. 
+ If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandAugment _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -379,6 +418,24 @@ def forward(self, *inputs: Any) -> Any: class TrivialAugmentWide(_AutoAugmentBase): + r"""[BETA] Dataset-independent data-augmentation with TrivialAugment Wide, as described in + `"TrivialAugment: Tuning-free Yet State-of-the-Art Data Augmentation" `_. + + .. betastatus:: TrivialAugmentWide transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + num_magnitude_bins (int): The number of different magnitude values. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.TrivialAugmentWide _AUGMENTATION_SPACE = { "Identity": (lambda num_bins, height, width: None, False), @@ -430,6 +487,29 @@ def forward(self, *inputs: Any) -> Any: class AugMix(_AutoAugmentBase): + r"""[BETA] AugMix data augmentation method based on + `"AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty" `_. + + .. betastatus:: AugMix transform + + If the image is torch Tensor, it should be of type torch.uint8, and it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + severity (int): The severity of base augmentation operators. Default is ``3``. + mixture_width (int): The number of augmentation chains. Default is ``3``. + chain_depth (int): The depth of augmentation chains. A negative value denotes stochastic depth sampled from the interval [1, 3]. + Default is ``-1``. + alpha (float): The hyperparameter for the probability distributions. Default is ``1.0``. + all_ops (bool): Use all operations (including brightness, contrast, color and sharpness). Default is ``True``. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + fill (sequence or number, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.AugMix _PARTIAL_AUGMENTATION_SPACE = { diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 64796e16ca4..526c9661991 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,6 +11,23 @@ class Grayscale(Transform): + """[BETA] Convert image to grayscale. + + .. 
betastatus:: Grayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + num_output_channels (int): (1 or 3) number of channels desired for output image + + Returns: + PIL Image: Grayscale version of the input. + + - If ``num_output_channels == 1`` : returned image is single channel + - If ``num_output_channels == 3`` : returned image is 3 channel with r == g == b + """ + _v1_transform_cls = _transforms.Grayscale _transformed_types = ( @@ -29,6 +46,24 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomGrayscale(_RandomApplyTransform): + """[BETA] Randomly convert image to grayscale with a probability of p (default 0.1). + + .. betastatus:: RandomGrayscale transform + + If the image is torch Tensor, it is expected + to have [..., 3, H, W] shape, where ... means an arbitrary number of leading dimensions + + Args: + p (float): probability that image should be converted to grayscale. + + Returns: + PIL Image or Tensor: Grayscale version of the input image with probability p and unchanged + with probability (1-p). + - If input image is 1 channel: grayscale version is 1 channel + - If input image is 3 channel: grayscale version is 3 channel with r == g == b + + """ + _v1_transform_cls = _transforms.RandomGrayscale _transformed_types = ( @@ -50,6 +85,32 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class ColorJitter(Transform): + """[BETA] Randomly change the brightness, contrast, saturation and hue of an image. + + .. betastatus:: ColorJitter transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, mode "1", "I", "F" and modes with transparency (alpha channel) are not supported. + + Args: + brightness (float or tuple of float (min, max)): How much to jitter brightness. + brightness_factor is chosen uniformly from [max(0, 1 - brightness), 1 + brightness] + or the given [min, max]. Should be non negative numbers. + contrast (float or tuple of float (min, max)): How much to jitter contrast. + contrast_factor is chosen uniformly from [max(0, 1 - contrast), 1 + contrast] + or the given [min, max]. Should be non-negative numbers. + saturation (float or tuple of float (min, max)): How much to jitter saturation. + saturation_factor is chosen uniformly from [max(0, 1 - saturation), 1 + saturation] + or the given [min, max]. Should be non negative numbers. + hue (float or tuple of float (min, max)): How much to jitter hue. + hue_factor is chosen uniformly from [-hue, hue] or the given [min, max]. + Should have 0<= hue <= 0.5 or -0.5 <= min <= max <= 0.5. + To jitter hue, the pixel values of the input image has to be non-negative for conversion to HSV space; + thus it does not work if you normalize your image to an interval with negative values, + or use an interpolation that generates negative values before using this function. + """ + _v1_transform_cls = _transforms.ColorJitter def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -205,6 +266,18 @@ def _transform( class RandomEqualize(_RandomApplyTransform): + """[BETA] Equalize the histogram of the given image randomly with a given probability. + + .. betastatus:: RandomEqualize transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. 
+ If img is PIL Image, it is expected to be in mode "P", "L" or "RGB". + + Args: + p (float): probability of the image being equalized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomEqualize def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -212,6 +285,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomInvert(_RandomApplyTransform): + """[BETA] Inverts the colors of the given image randomly with a given probability. + + .. betastatus:: RandomInvert transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being color inverted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomInvert def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -219,6 +304,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPosterize(_RandomApplyTransform): + """[BETA] Posterize the image randomly with a given probability by reducing the + number of bits for each color channel. + + .. betastatus:: RandomPosterize transform + + If the image is torch Tensor, it should be of type torch.uint8, + and it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + bits (int): number of bits to keep for each channel (0-8) + p (float): probability of the image being posterized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomPosterize def __init__(self, bits: int, p: float = 0.5) -> None: @@ -230,6 +329,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomSolarize(_RandomApplyTransform): + """[BETA] Solarize the image randomly with a given probability by inverting all pixel + values above a threshold. + + .. betastatus:: RandomSolarize transform + + If img is a Tensor, it is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + threshold (float): all pixels equal or above this value are inverted. + p (float): probability of the image being solarized. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomSolarize def __init__(self, threshold: float, p: float = 0.5) -> None: @@ -241,6 +354,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAutocontrast(_RandomApplyTransform): + """[BETA] Autocontrast the pixels of the given image randomly with a given probability. + + .. betastatus:: RandomAutocontrast transform + + If the image is torch Tensor, it is expected + to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + If img is PIL Image, it is expected to be in mode "L" or "RGB". + + Args: + p (float): probability of the image being autocontrasted. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAutocontrast def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -248,6 +373,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAdjustSharpness(_RandomApplyTransform): + """[BETA] Adjust the sharpness of the image randomly with a given probability. + + .. 
betastatus:: RandomAdjustSharpness transform + + If the image is torch Tensor, + it is expected to have [..., 1 or 3, H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + sharpness_factor (float): How much to adjust the sharpness. Can be + any non-negative number. 0 gives a blurred image, 1 gives the + original image while 2 increases the sharpness by a factor of 2. + p (float): probability of the image being sharpened. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomAdjustSharpness def __init__(self, sharpness_factor: float, p: float = 0.5) -> None: diff --git a/torchvision/transforms/v2/_container.py b/torchvision/transforms/v2/_container.py index 555010fda1e..66da9c187c0 100644 --- a/torchvision/transforms/v2/_container.py +++ b/torchvision/transforms/v2/_container.py @@ -9,6 +9,37 @@ class Compose(Transform): + """[BETA] Composes several transforms together. + + .. betastatus:: Compose transform + + This transform does not support torchscript. + Please, see the note below. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.PILToTensor(), + >>> transforms.ConvertImageDtype(torch.float), + >>> ]) + + .. note:: + In order to script the transformations, please use ``torch.nn.Sequential`` as below. + + >>> transforms = torch.nn.Sequential( + >>> transforms.CenterCrop(10), + >>> transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + >>> ) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + """ + def __init__(self, transforms: Sequence[Callable]) -> None: super().__init__() if not isinstance(transforms, Sequence): @@ -29,6 +60,27 @@ def extra_repr(self) -> str: class RandomApply(Transform): + """[BETA] Apply randomly a list of transformations with a given probability. + + .. betastatus:: RandomApply transform + + .. note:: + In order to script the transformation, please use ``torch.nn.ModuleList`` as input instead of list/tuple of + transforms as shown below: + + >>> transforms = transforms.RandomApply(torch.nn.ModuleList([ + >>> transforms.ColorJitter(), + >>> ]), p=0.3) + >>> scripted_transforms = torch.jit.script(transforms) + + Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require + `lambda` functions or ``PIL.Image``. + + Args: + transforms (sequence or torch.nn.Module): list of transformations + p (float): probability + """ + _v1_transform_cls = _transforms.RandomApply def __init__(self, transforms: Union[Sequence[Callable], nn.ModuleList], p: float = 0.5) -> None: @@ -63,6 +115,12 @@ def extra_repr(self) -> str: class RandomChoice(Transform): + """[BETA] Apply single transformation randomly picked from a list. + + .. betastatus:: RandomChoice transform + + This transform does not support torchscript.""" + def __init__( self, transforms: Sequence[Callable], @@ -99,6 +157,13 @@ def forward(self, *inputs: Any) -> Any: class RandomOrder(Transform): + """[BETA] Apply a list of transformations in a random order. + + .. betastatus:: RandomOrder transform + + This transform does not support torchscript. 
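
    Example (a minimal usage sketch; the particular transforms listed inside are only illustrative):
        >>> from torchvision.transforms import v2 as transforms
        >>> transform = transforms.RandomOrder([
        >>>     transforms.RandomHorizontalFlip(p=0.5),
        >>>     transforms.ColorJitter(brightness=0.2),
        >>> ])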
+ """ + def __init__(self, transforms: Sequence[Callable]) -> None: if not isinstance(transforms, Sequence): raise TypeError("Argument transforms should be a sequence of callables") diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index bfb0d06239f..29f2017f038 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -10,6 +10,31 @@ class ToTensor(Transform): + """[BETA] Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor. + + .. betastatus:: ToTensor transform + + .. warning:: + v2.ToTensor is deprecated and will be removed in a future release. + Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. + + This transform does not support torchscript. + + + Converts a PIL Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] + if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1) + or if the numpy.ndarray has dtype = np.uint8 + + In the other cases, tensors are returned without scaling. + + .. note:: + Because the input image is scaled to [0.0, 1.0], this transformation should not be used when + transforming target image masks. See the `references`_ for implementing the transforms for image masks. + + .. _references: https://github.com/pytorch/vision/tree/main/references/segmentation + """ + _transformed_types = (PIL.Image.Image, np.ndarray) def __init__(self) -> None: diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index f1eed87b9c0..322b317cac5 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -26,6 +26,18 @@ class RandomHorizontalFlip(_RandomApplyTransform): + """[BETA] Horizontally flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomHorizontalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomHorizontalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -33,6 +45,18 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomVerticalFlip(_RandomApplyTransform): + """[BETA] Vertically flip the given image/box/mask randomly with a given probability. + + .. betastatus:: RandomVerticalFlip transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + Args: + p (float): probability of the image being flipped. Default value is 0.5 + """ + _v1_transform_cls = _transforms.RandomVerticalFlip def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: @@ -40,6 +64,62 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Resize(Transform): + """[BETA] Resize the input image/box/mask to the given size. + + .. betastatus:: Resize transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + .. warning:: + The output image might be different depending on its type: when downsampling, the interpolation of PIL images + and tensors is slightly different, because PIL applies antialiasing. 
This may lead to significant differences
        in the performance of a network. Therefore, it is preferable to train and serve a model with the same input
        types. See also below the ``antialias`` parameter, which can help make the output of PIL images and tensors
        closer.

    Args:
        size (sequence or int): Desired output size. If size is a sequence like
            (h, w), output size will be matched to this. If size is an int,
            smaller edge of the image will be matched to this number.
            i.e., if height > width, then image will be rescaled to
            (size * height / width, size).

            .. note::
                In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
        interpolation (InterpolationMode): Desired interpolation enum defined by
            :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
            If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
            ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
            The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR``, are accepted as well.
        max_size (int, optional): The maximum allowed for the longer edge of
            the resized image: if the longer edge of the image is greater
            than ``max_size`` after being resized according to ``size``, then
            the image is resized again so that the longer edge is equal to
            ``max_size``. As a result, ``size`` might be overruled, i.e. the
            smaller edge may be shorter than ``size``. This is only supported
            if ``size`` is an int (or a sequence of length 1 in torchscript
            mode).
        antialias (bool, optional): Whether to apply antialiasing.
            It only affects **tensors** with bilinear or bicubic modes and it is
            ignored otherwise: on PIL images, antialiasing is always applied on
            bilinear or bicubic modes; on other modes (for PIL images and
            tensors), antialiasing makes no sense and this parameter is ignored.
            Possible values are:

            - ``True``: will apply antialiasing for bilinear or bicubic modes.
              Other modes aren't affected. This is probably what you want to use.
            - ``False``: will not apply antialiasing for tensors on any mode. PIL
              images are still antialiased on bilinear or bicubic modes, because
              PIL does not support disabling antialiasing.
            - ``None``: equivalent to ``False`` for tensors and ``True`` for
              PIL images. This value exists for legacy reasons and you probably
              don't want to use it unless you really know what you are doing.

            The current default is ``None`` **but will change to** ``True`` **in
            v0.17** for the PIL and Tensor backends to be consistent.
    """

    _v1_transform_cls = _transforms.Resize

    def __init__(
@@ -76,6 +156,20 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class CenterCrop(Transform):
    """[BETA] Crops the given image/box/mask at the center.

    .. betastatus:: CenterCrop transform

    If the image is torch Tensor, it is expected
    to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions.
    If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
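
    Example (a small illustrative sketch; the 480x640 input used here is arbitrary):
        >>> import torch
        >>> from torchvision.transforms import v2 as transforms
        >>> crop = transforms.CenterCrop(224)
        >>> crop(torch.rand(3, 480, 640)).shape
        torch.Size([3, 224, 224])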
+ """ + _v1_transform_cls = _transforms.CenterCrop def __init__(self, size: Union[int, Sequence[int]]): @@ -87,6 +181,53 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomResizedCrop(Transform): + """[BETA] Crop a random portion of image/box/mask and resize it to a given size. + + .. betastatus:: RandomResizedCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions + + A crop of the original image is made: the crop has a random area (H * W) + and a random aspect ratio. This crop is finally resized to the given + size. This is popularly used to train the Inception networks. + + Args: + size (int or sequence): expected output size of the crop, for each edge. If size is an + int instead of sequence like (h, w), a square output size ``(size, size)`` is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + .. note:: + In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``. + scale (tuple of float): Specifies the lower and upper bounds for the random area of the crop, + before resizing. The scale is defined with respect to the area of the original image. + ratio (tuple of float): lower and upper bounds for the random aspect ratio of the crop, before + resizing. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``, + ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + antialias (bool, optional): Whether to apply antialiasing. + It only affects **tensors** with bilinear or bicubic modes and it is + ignored otherwise: on PIL images, antialiasing is always applied on + bilinear or bicubic modes; on other modes (for PIL images and + tensors), antialiasing makes no sense and this parameter is ignored. + Possible values are: + + - ``True``: will apply antialiasing for bilinear or bicubic modes. + Other mode aren't affected. This is probably what you want to use. + - ``False``: will not apply antialiasing for tensors on any mode. PIL + images are still antialiased on bilinear or bicubic modes, because + PIL doesn't support no antialias. + - ``None``: equivalent to ``False`` for tensors and ``True`` for + PIL images. This value exists for legacy reasons and you probably + don't want to use it unless you really know what you are doing. + + The current default is ``None`` **but will change to** ``True`` **in + v0.17** for the PIL and Tensor backends to be consistent. + """ + _v1_transform_cls = _transforms.RandomResizedCrop def __init__( @@ -164,25 +305,23 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class FiveCrop(Transform): - """ - Example: - >>> class BatchMultiCrop(transforms.Transform): - ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): - ... images_or_videos, labels = sample - ... batch_size = len(images_or_videos) - ... image_or_video = images_or_videos[0] - ... images_or_videos = image_or_video.wrap_like(image_or_video, torch.stack(images_or_videos)) - ... labels = torch.full((batch_size,), label, device=images_or_videos.device) - ... return images_or_videos, labels - ... 
- >>> image = datapoints.Image(torch.rand(3, 256, 256)) - >>> label = 3 - >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()]) - >>> images, labels = transform(image, label) - >>> images.shape - torch.Size([5, 3, 224, 224]) - >>> labels - tensor([3, 3, 3, 3, 3]) + """[BETA] Crop the given image/box/mask into four corners and the central crop. + + .. betastatus:: FiveCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an ``int`` + instead of sequence like (h, w), a square crop of size (size, size) is made. + If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). """ _v1_transform_cls = _transforms.FiveCrop @@ -209,8 +348,25 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None: class TenCrop(Transform): - """ - See :class:`~torchvision.transforms.v2.FiveCrop` for an example. + """[BETA] Crop the given image/box/mask into four corners and the central crop plus the flipped version of + these (horizontal flipping is used by default). + + .. betastatus:: TenCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading + dimensions + + .. Note:: + This transform returns a tuple of images and there may be a mismatch in the number of + inputs and targets your Dataset returns. See below for an example of how to deal with + this. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + vertical_flip (bool): Use vertical flipping instead of horizontal """ _v1_transform_cls = _transforms.TenCrop @@ -249,6 +405,46 @@ def _transform( class Pad(Transform): + """[BETA] Pad the given image/box/mask on all sides with the given "pad" value. + + .. betastatus:: Pad transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means at most 2 leading dimensions for mode reflect and symmetric, + at most 3 leading dimensions for mode edge, + and an arbitrary number of leading dimensions for mode constant + + Args: + padding (int or sequence): Padding on each border. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. 
+ + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.Pad def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -323,6 +519,34 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomRotation(Transform): + """[BETA] Rotate the image/box/mask by angle. + + .. betastatus:: RandomRotation transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + expand (bool, optional): Optional expansion flag. + If true, expands the output to make it large enough to hold the entire rotated image. + If false or omitted, make the output image the same size as the input image. + Note that the expand flag assumes rotation around the center and no translation. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + fill (sequence or number): Pixel fill value for the area outside the rotated + image. Default is ``0``. If given a number, the value is used for all bands respectively. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomRotation def __init__( @@ -363,6 +587,42 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomAffine(Transform): + """[BETA] Random affine transformation of the image/box/mask keeping center invariant. + + .. betastatus:: RandomAffine transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + degrees (sequence or number): Range of degrees to select from. + If degrees is a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees). Set to 0 to deactivate rotations. + translate (tuple, optional): tuple of maximum absolute fraction for horizontal + and vertical translations. For example translate=(a, b), then horizontal shift + is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is + randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default. + scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is + randomly sampled from the range a <= scale <= b. 
Will keep original scale by default. + shear (sequence or number, optional): Range of degrees to select from. + If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear) + will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the + range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values, + an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Will not apply shear by default. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner. + Default is the center of the image. + + .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters + + """ + _v1_transform_cls = _transforms.RandomAffine def __init__( @@ -443,6 +703,52 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomCrop(Transform): + """[BETA] Crop the given image/box/mask at a random location. + + .. betastatus:: RandomCrop transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions, + but if non-constant padding is used, the input is expected to have at most 2 leading dimensions + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None. If a single int is provided this + is used to pad all borders. If sequence of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a sequence of length 4 is provided + this is the padding for the left, top, right and bottom borders respectively. + + .. note:: + In torchscript mode padding as single int is not supported, use a sequence of + length 1: ``[padding, ]``. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill (number or tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. + Only number is supported for torch Tensor. + Only int or tuple value is supported for PIL Image. + padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. + Default is constant. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value at the edge of the image. + If input a 5D torch Tensor, the last 3 dimensions will be padded instead of the last 2 + + - reflect: pads with reflection of image without repeating the last value on the edge. 
+ For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image repeating the last value on the edge. + For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + _v1_transform_cls = _transforms.RandomCrop def _extract_params_for_v1_transform(self) -> Dict[str, Any]: @@ -552,6 +858,25 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class RandomPerspective(_RandomApplyTransform): + """[BETA] Performs a random perspective transformation of the given image/box/mask with a given probability. + + .. betastatus:: RandomPerspective transform + + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + + Args: + distortion_scale (float): argument to control the degree of distortion and ranges from 0 to 1. + Default is 0.5. + p (float): probability of the image being transformed. Default is 0.5. + interpolation (InterpolationMode): Desired interpolation enum defined by + :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``. + If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported. + The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well. + fill (sequence or number): Pixel fill value for the area outside the transformed + image. Default is ``0``. If given a number, the value is used for all bands respectively. + """ + _v1_transform_cls = _transforms.RandomPerspective def __init__( diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py index 0d1544094ca..7d0f0ec39f9 100644 --- a/torchvision/transforms/v2/_meta.py +++ b/torchvision/transforms/v2/_meta.py @@ -22,6 +22,27 @@ def _transform(self, inpt: datapoints.BoundingBox, params: Dict[str, Any]) -> da class ConvertDtype(Transform): + """[BETA] Convert a tensor image/box/mask to the given ``dtype`` and scale the values accordingly + + .. betastatus:: ConvertDtype transform + + This function does not support PIL Image. + + Args: + dtype (torch.dtype): Desired data type of the output + + .. note:: + + When converting from a smaller to a larger integer ``dtype`` the maximum values are **not** mapped exactly. + If converted back and forth, this mismatch has no effect. + + Raises: + RuntimeError: When trying to cast :class:`torch.float32` to :class:`torch.int32` or :class:`torch.int64` as + well as for trying to cast :class:`torch.float64` to :class:`torch.int64`. These conversions might lead to + overflow errors since the floating point ``dtype`` cannot store consecutive integers over the whole range + of the integer ``dtype``. + """ + _v1_transform_cls = _transforms.ConvertImageDtype _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 6dd0755cfbb..4c260f97b13 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -21,6 +21,16 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class Lambda(Transform): + """[BETA] Apply a user-defined lambda as a transform. + + .. betastatus:: Lambda transform + + This transform does not support torchscript. + + Args: + lambd (function): Lambda/function to be used for transform. 
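
    Example (a minimal sketch; the clamping function is just an arbitrary callable):
        >>> import torch
        >>> from torchvision.transforms import v2 as transforms
        >>> transform = transforms.Lambda(lambda x: x.clamp(0, 1))
        >>> out = transform(torch.rand(3, 8, 8))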
+    """

    def __init__(self, lambd: Callable[[Any], Any], *types: Type):
        super().__init__()
        self.lambd = lambd
@@ -42,6 +52,26 @@ def extra_repr(self) -> str:


 class LinearTransformation(Transform):
    """[BETA] Transform a tensor image with a square transformation matrix and a mean_vector computed offline.

    .. betastatus:: LinearTransformation transform

    This transform does not support PIL Image.
    Given transformation_matrix and mean_vector, will flatten the torch.*Tensor and
    subtract mean_vector from it, which is then followed by computing the dot
    product with the transformation matrix and then reshaping the tensor to its
    original shape.

    Applications:
        whitening transformation: Suppose X is a column vector of zero-centered data.
        Then compute the data covariance matrix [D x D] with torch.mm(X.t(), X),
        perform SVD on this matrix and pass it as transformation_matrix.

    Args:
        transformation_matrix (Tensor): tensor [D x D], D = C x H x W
        mean_vector (Tensor): tensor [D], D = C x H x W
    """

    _v1_transform_cls = _transforms.LinearTransformation

    _transformed_types = (is_simple_tensor, datapoints.Image, datapoints.Video)
@@ -105,6 +135,26 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:


 class Normalize(Transform):
    """[BETA] Normalize a tensor image with mean and standard deviation.

    .. betastatus:: Normalize transform

    This transform does not support PIL Image.
    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
    channels, this transform will normalize each channel of the input
    ``torch.*Tensor``, i.e.,
    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``

    .. note::
        This transform acts out of place, i.e., it does not mutate the input tensor.

    Args:
        mean (sequence): Sequence of means for each channel.
        std (sequence): Sequence of standard deviations for each channel.
        inplace (bool, optional): Bool to make this operation in-place.

    """

    _v1_transform_cls = _transforms.Normalize

    _transformed_types = (datapoints.Image, is_simple_tensor, datapoints.Video)
@@ -124,7 +174,26 @@ def _transform(
        return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace)


+# TODO: This should be in the _color.py file!
 class GaussianBlur(Transform):
    """[BETA] Blurs image with randomly chosen Gaussian blur.

    .. betastatus:: GaussianBlur transform

    If the image is torch Tensor, it is expected
    to have [..., C, H, W] shape, where ... means an arbitrary number of leading dimensions.

    Args:
        kernel_size (int or sequence): Size of the Gaussian kernel.
        sigma (float or tuple of float (min, max)): Standard deviation to be used for
            creating kernel to perform blurring. If float, sigma is fixed. If it is tuple
            of float (min, max), sigma is chosen uniformly at random to lie in the
            given range.

    Returns:
        PIL Image or Tensor: Gaussian blurred version of the input image.
    """

    _v1_transform_cls = _transforms.GaussianBlur

    def __init__(
diff --git a/torchvision/transforms/v2/_type_conversion.py b/torchvision/transforms/v2/_type_conversion.py
index 984d5ba50c0..b0743feb10d 100644
--- a/torchvision/transforms/v2/_type_conversion.py
+++ b/torchvision/transforms/v2/_type_conversion.py
@@ -11,6 +11,15 @@


 class PILToTensor(Transform):
    """[BETA] Convert a ``PIL Image`` to a tensor of the same type.

    .. betastatus:: PILToTensor transform

    This transform does not support torchscript.

    Converts a PIL Image (H x W x C) to a Tensor of shape (C x H x W).
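
    Example (illustrative; a blank 640x480 RGB image is used only to show the resulting shape):
        >>> import PIL.Image
        >>> from torchvision.transforms import v2 as transforms
        >>> pil_img = PIL.Image.new("RGB", (640, 480))
        >>> transforms.PILToTensor()(pil_img).shape
        torch.Size([3, 480, 640])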
+ """ + _transformed_types = (PIL.Image.Image,) def _transform(self, inpt: PIL.Image.Image, params: Dict[str, Any]) -> torch.Tensor: @@ -27,6 +36,27 @@ def _transform( class ToImagePIL(Transform): + """[BETA] Convert a tensor or an ndarray to PIL Image. + + .. betastatus:: ToImagePIL transform + + This transform does not support torchscript. + + Converts a torch.*Tensor of shape C x H x W or a numpy ndarray of shape + H x W x C to a PIL Image while preserving the value range. + + Args: + mode (`PIL.Image mode`_): color space and pixel depth of input data (optional). + If ``mode`` is ``None`` (default) there are some assumptions made about the input data: + - If the input has 4 channels, the ``mode`` is assumed to be ``RGBA``. + - If the input has 3 channels, the ``mode`` is assumed to be ``RGB``. + - If the input has 2 channels, the ``mode`` is assumed to be ``LA``. + - If the input has 1 channel, the ``mode`` is determined by the data type (i.e ``int``, ``float``, + ``short``). + + .. _PIL.Image mode: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#concept-modes + """ + _transformed_types = (is_simple_tensor, datapoints.Image, np.ndarray) def __init__(self, mode: Optional[str] = None) -> None: From 11d295cc9609718173806c0db647f279c034f531 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 16:31:41 +0000 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: vfdev Co-authored-by: Philip Meier --- torchvision/transforms/v2/_augment.py | 6 ++++-- torchvision/transforms/v2/_color.py | 2 +- torchvision/transforms/v2/_deprecated.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 405fac15910..4473b23de6f 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -14,7 +14,7 @@ # TODO: Just move that to _misc.py? class RandomErasing(_RandomApplyTransform): - """[BETA] Randomly selects a rectangle region in a torch.Tensor image and erases its pixels. + """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels. .. betastatus:: RandomErasing transform @@ -32,9 +32,11 @@ class RandomErasing(_RandomApplyTransform): inplace: boolean to make this transform inplace. Default set to False. Returns: - Erased Image. + Erased input. Example: + >>> from torchvision.transforms import v2 as transforms + >>> >>> transform = transforms.Compose([ >>> transforms.RandomHorizontalFlip(), >>> transforms.PILToTensor(), diff --git a/torchvision/transforms/v2/_color.py b/torchvision/transforms/v2/_color.py index 526c9661991..785a3965e60 100644 --- a/torchvision/transforms/v2/_color.py +++ b/torchvision/transforms/v2/_color.py @@ -11,7 +11,7 @@ class Grayscale(Transform): - """[BETA] Convert image to grayscale. + """[BETA] Convert images or videos to grayscale. .. betastatus:: Grayscale transform diff --git a/torchvision/transforms/v2/_deprecated.py b/torchvision/transforms/v2/_deprecated.py index 29f2017f038..c44e6b08d11 100644 --- a/torchvision/transforms/v2/_deprecated.py +++ b/torchvision/transforms/v2/_deprecated.py @@ -15,7 +15,7 @@ class ToTensor(Transform): .. betastatus:: ToTensor transform .. warning:: - v2.ToTensor is deprecated and will be removed in a future release. + :class:`v2.ToTensor` is deprecated and will be removed in a future release. Please use instead ``transforms.Compose([transforms.ToImageTensor(), transforms.ConvertImageDtype()])``. 
This transform does not support torchscript. From 6a414fecfa4eace10942e4ec78724f49c7f30d67 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 21 Feb 2023 16:34:36 +0000 Subject: [PATCH 3/3] Address comments --- torchvision/transforms/v2/_augment.py | 1 - torchvision/transforms/v2/_geometry.py | 23 ++++++++++++++++++++++- torchvision/transforms/v2/_misc.py | 1 - 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index 4473b23de6f..b5aac9ca9a2 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -12,7 +12,6 @@ from .utils import is_simple_tensor, query_chw -# TODO: Just move that to _misc.py? class RandomErasing(_RandomApplyTransform): """[BETA] Randomly selects a rectangle region in the input image or video and erases its pixels. diff --git a/torchvision/transforms/v2/_geometry.py b/torchvision/transforms/v2/_geometry.py index 322b317cac5..af8ca4b6471 100644 --- a/torchvision/transforms/v2/_geometry.py +++ b/torchvision/transforms/v2/_geometry.py @@ -322,6 +322,25 @@ class FiveCrop(Transform): size (sequence or int): Desired output size of the crop. If size is an ``int`` instead of sequence like (h, w), a square crop of size (size, size) is made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + + Example: + >>> class BatchMultiCrop(transforms.Transform): + ... def forward(self, sample: Tuple[Tuple[Union[datapoints.Image, datapoints.Video], ...], int]): + ... images_or_videos, labels = sample + ... batch_size = len(images_or_videos) + ... image_or_video = images_or_videos[0] + ... images_or_videos = image_or_video.wrap_like(image_or_video, torch.stack(images_or_videos)) + ... labels = torch.full((batch_size,), label, device=images_or_videos.device) + ... return images_or_videos, labels + ... + >>> image = datapoints.Image(torch.rand(3, 256, 256)) + >>> label = 3 + >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()]) + >>> images, labels = transform(image, label) + >>> images.shape + torch.Size([5, 3, 224, 224]) + >>> labels + tensor([3, 3, 3, 3, 3]) """ _v1_transform_cls = _transforms.FiveCrop @@ -355,7 +374,9 @@ class TenCrop(Transform): If the image is torch Tensor, it is expected to have [..., H, W] shape, where ... means an arbitrary number of leading - dimensions + dimensions. + + See :class:`~torchvision.transforms.v2.FiveCrop` for an example. .. Note:: This transform returns a tuple of images and there may be a mismatch in the number of diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index 4c260f97b13..6998d416c91 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -174,7 +174,6 @@ def _transform( return F.normalize(inpt, mean=self.mean, std=self.std, inplace=self.inplace) -# TODO: This should be in the _color.py file! class GaussianBlur(Transform): """[BETA] Blurs image with randomly chosen Gaussian blur.