@@ -11,6 +11,7 @@
 from torch.nn.functional import one_hot
 from torchvision.prototype import features
 from torchvision.prototype.transforms.functional._meta import convert_bounding_box_format
+from torchvision.transforms.functional import _get_perspective_coeffs
 from torchvision.transforms.functional_tensor import _max_value as get_max_value

 make_tensor = functools.partial(torch.testing.make_tensor, device="cpu")
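The new import, `_get_perspective_coeffs`, fits the eight coefficients (a, b, c, d, e, f, g, h) of a projective transform from four point correspondences by solving a small least-squares system. For reference, the mapping those coefficients describe looks roughly like the sketch below; `apply_perspective` is a hypothetical helper, not torchvision API, and it mirrors the m1/m2 matrices used in the tests further down, where the x and y numerators share a single denominator.

def apply_perspective(x, y, c):
    # Hypothetical sketch, not torchvision code. c = [a, b, c, d, e, f, g, h];
    # both output coordinates share the denominator g*x + h*y + 1.
    denom = c[6] * x + c[7] * y + 1.0
    return (
        (c[0] * x + c[1] * y + c[2]) / denom,
        (c[3] * x + c[4] * y + c[5]) / denom,
    )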
@@ -380,6 +381,37 @@ def pad_segmentation_mask():
         yield SampleInput(mask, padding=padding, padding_mode=padding_mode)


+@register_kernel_info_from_sample_inputs_fn
+def perspective_bounding_box():
+    for bounding_box, perspective_coeffs in itertools.product(
+        make_bounding_boxes(),
+        [
+            [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
+            [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
+        ],
+    ):
+        yield SampleInput(
+            bounding_box,
+            format=bounding_box.format,
+            perspective_coeffs=perspective_coeffs,
+        )
+
+
+@register_kernel_info_from_sample_inputs_fn
+def perspective_segmentation_mask():
+    for mask, perspective_coeffs in itertools.product(
+        make_segmentation_masks(extra_dims=((), (4,))),
+        [
+            [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
+            [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
+        ],
+    ):
+        yield SampleInput(
+            mask,
+            perspective_coeffs=perspective_coeffs,
+        )
+
+
 @register_kernel_info_from_sample_inputs_fn
 def center_crop_bounding_box():
     for bounding_box, output_size in itertools.product(make_bounding_boxes(), [(24, 12), [16, 18], [46, 48], [12]]):
@@ -993,7 +1025,7 @@ def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device):
     ],
 )
 def test_correctness_resized_crop_bounding_box(device, format, top, left, height, width, size):
-    def _compute_expected(bbox, top_, left_, height_, width_, size_):
+    def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_):
         # bbox should be xyxy
         bbox[0] = (bbox[0] - left_) * size_[1] / width_
         bbox[1] = (bbox[1] - top_) * size_[0] / height_
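The helper's math is the plain crop-then-resize coordinate change: shift by the crop origin, then scale by output size over crop extent. A quick sanity check with made-up numbers (all values below are illustrative only):

# Crop of height=30, width=40 at top=10, left=20, resized to size=(60, 120):
# x coordinates scale by 120 / 40 = 3, y coordinates by 60 / 30 = 2.
top, left, height, width, size = 10, 20, 30, 40, (60, 120)
x1, y1 = 24.0, 16.0
print((x1 - left) * size[1] / width)   # (24 - 20) * 3 = 12.0
print((y1 - top) * size[0] / height)   # (16 - 10) * 2 = 12.0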
@@ -1009,7 +1041,7 @@ def _compute_expected(bbox, top_, left_, height_, width_, size_):
     ]
     expected_bboxes = []
     for in_box in in_boxes:
-        expected_bboxes.append(_compute_expected(list(in_box), top, left, height, width, size))
+        expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size))
     expected_bboxes = torch.tensor(expected_bboxes, device=device)

     in_boxes = features.BoundingBox(
@@ -1035,7 +1067,7 @@ def _compute_expected(bbox, top_, left_, height_, width_, size_):
     ],
 )
 def test_correctness_resized_crop_segmentation_mask(device, top, left, height, width, size):
-    def _compute_expected(mask, top_, left_, height_, width_, size_):
+    def _compute_expected_mask(mask, top_, left_, height_, width_, size_):
         output = mask.clone()
         output = output[:, top_ : top_ + height_, left_ : left_ + width_]
         output = torch.nn.functional.interpolate(output[None, :].float(), size=size_, mode="nearest")
@@ -1046,7 +1078,7 @@ def _compute_expected(mask, top_, left_, height_, width_, size_):
     in_mask[0, 10:20, 10:20] = 1
     in_mask[0, 5:15, 12:23] = 2

-    expected_mask = _compute_expected(in_mask, top, left, height, width, size)
+    expected_mask = _compute_expected_mask(in_mask, top, left, height, width, size)
     output_mask = F.resized_crop_segmentation_mask(in_mask, top, left, height, width, size)
     torch.testing.assert_close(output_mask, expected_mask)
@@ -1095,6 +1127,161 @@ def parse_padding():
     torch.testing.assert_close(out_mask, expected_mask)


+@pytest.mark.parametrize("device", cpu_and_gpu())
+@pytest.mark.parametrize(
+    "startpoints, endpoints",
+    [
+        [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
+    ],
+)
+def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
+    def _compute_expected_bbox(bbox, pcoeffs_):
+        m1 = np.array(
+            [
+                [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]],
+                [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+            ]
+        )
+
+        bbox_xyxy = convert_bounding_box_format(
+            bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY
+        )
+        points = np.array(
+            [
+                [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0],
+                [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0],
+                [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0],
+                [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0],
+            ]
+        )
+        numer = np.matmul(points, m1.T)
+        denom = np.matmul(points, m2.T)
+        transformed_points = numer / denom
+        out_bbox = [
+            np.min(transformed_points[:, 0]),
+            np.min(transformed_points[:, 1]),
+            np.max(transformed_points[:, 0]),
+            np.max(transformed_points[:, 1]),
+        ]
+        out_bbox = features.BoundingBox(
+            out_bbox,
+            format=features.BoundingBoxFormat.XYXY,
+            image_size=bbox.image_size,
+            dtype=torch.float32,
+            device=bbox.device,
+        )
+        return convert_bounding_box_format(
+            out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False
+        )
+
+    image_size = (32, 38)
+
+    pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
+    inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints)
+
+    for bboxes in make_bounding_boxes(
+        image_sizes=[
+            image_size,
+        ],
+        extra_dims=((4,),),
+    ):
+        bboxes = bboxes.to(device)
+        bboxes_format = bboxes.format
+        bboxes_image_size = bboxes.image_size
+
+        output_bboxes = F.perspective_bounding_box(
+            bboxes,
+            bboxes_format,
+            perspective_coeffs=pcoeffs,
+        )
+
+        if bboxes.ndim < 2:
+            bboxes = [bboxes]
+
+        expected_bboxes = []
+        for bbox in bboxes:
+            bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size)
+            expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs))
+        if len(expected_bboxes) > 1:
+            expected_bboxes = torch.stack(expected_bboxes)
+        else:
+            expected_bboxes = expected_bboxes[0]
+        torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=1e-5, atol=1e-5)
+
+
+@pytest.mark.parametrize("device", cpu_and_gpu())
+@pytest.mark.parametrize(
+    "startpoints, endpoints",
+    [
+        [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
+    ],
+)
+def test_correctness_perspective_segmentation_mask(device, startpoints, endpoints):
+    def _compute_expected_mask(mask, pcoeffs_):
+        assert mask.ndim == 3 and mask.shape[0] == 1
+        m1 = np.array(
+            [
+                [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]],
+                [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+            ]
+        )
+
+        expected_mask = torch.zeros_like(mask.cpu())
+        for out_y in range(expected_mask.shape[1]):
+            for out_x in range(expected_mask.shape[2]):
+                output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0])
+
+                numer = np.matmul(output_pt, m1.T)
+                denom = np.matmul(output_pt, m2.T)
+                input_pt = np.floor(numer / denom).astype(np.int32)
+
+                in_x, in_y = input_pt[:2]
+                if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]:
+                    expected_mask[0, out_y, out_x] = mask[0, in_y, in_x]
+        return expected_mask.to(mask.device)
+
+    pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
+
+    for mask in make_segmentation_masks(extra_dims=((), (4,))):
+        mask = mask.to(device)
+
+        output_mask = F.perspective_segmentation_mask(
+            mask,
+            perspective_coeffs=pcoeffs,
+        )
+
+        if mask.ndim < 4:
+            masks = [mask]
+        else:
+            masks = [m for m in mask]
+
+        expected_masks = []
+        for mask in masks:
+            expected_mask = _compute_expected_mask(mask, pcoeffs)
+            expected_masks.append(expected_mask)
+        if len(expected_masks) > 1:
+            expected_masks = torch.stack(expected_masks)
+        else:
+            expected_masks = expected_masks[0]
+        torch.testing.assert_close(output_mask, expected_masks)
+
+
 @pytest.mark.parametrize("device", cpu_and_gpu())
 @pytest.mark.parametrize(
     "output_size",
@@ -1148,5 +1335,4 @@ def _compute_expected_bbox(bbox, output_size_):
             expected_bboxes = torch.stack(expected_bboxes)
         else:
             expected_bboxes = expected_bboxes[0]
-        expected_bboxes = expected_bboxes.to(device=device)
         torch.testing.assert_close(output_boxes, expected_bboxes)