From 5106cfc7260803660f8dfbd7ad51807972c50722 Mon Sep 17 00:00:00 2001 From: zyan3 Date: Fri, 6 Sep 2019 22:51:47 -0700 Subject: [PATCH 1/5] video transforms --- test/test_transforms.py | 2 +- test/test_transforms_video.py | 173 +++++++++++++++++++++ torchvision/transforms/__init__.py | 1 + torchvision/transforms/functional_video.py | 100 ++++++++++++ torchvision/transforms/transforms.py | 34 ++-- torchvision/transforms/transforms_video.py | 171 ++++++++++++++++++++ 6 files changed, 463 insertions(+), 18 deletions(-) create mode 100644 test/test_transforms_video.py create mode 100644 torchvision/transforms/functional_video.py create mode 100644 torchvision/transforms/transforms_video.py diff --git a/test/test_transforms.py b/test/test_transforms.py index 7e8320d6d6c..e4c0759074c 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -148,7 +148,7 @@ def test_randomresized_params(self): aspect_min = max(round(random.random(), 2), epsilon) aspect_ratio_range = (aspect_min, aspect_min + round(random.random(), 2)) randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range) - i, j, h, w = randresizecrop.get_params(img, scale_range, aspect_ratio_range) + i, j, h, w = randresizecrop.get_params(img.size[1], img.size[0], scale_range, aspect_ratio_range) aspect_ratio_obtained = w / h assert (min(aspect_ratio_range) - epsilon <= aspect_ratio_obtained <= max(aspect_ratio_range) + epsilon or aspect_ratio_obtained == 1.0) diff --git a/test/test_transforms_video.py b/test/test_transforms_video.py new file mode 100644 index 00000000000..30370218ddb --- /dev/null +++ b/test/test_transforms_video.py @@ -0,0 +1,173 @@ +from __future__ import division +import torch +import torchvision.transforms as transforms +import unittest +import random +import numpy as np + +try: + from scipy import stats +except ImportError: + stats = None + + +class Tester(unittest.TestCase): + + def test_random_crop_video(self): + numFrames = random.randint(4, 128) + height = random.randint(10, 32) * 2 + width = random.randint(10, 32) * 2 + oheight = random.randint(5, (height - 2) / 2) * 2 + owidth = random.randint(5, (width - 2) / 2) * 2 + clip = torch.ones((numFrames, height, width, 3), dtype=torch.uint8) + result = transforms.Compose([ + transforms.ToTensorVideo(), + transforms.RandomCropVideo((oheight, owidth)), + ])(clip) + assert result.size(2) == oheight + assert result.size(3) == owidth + + transforms.RandomCropVideo((oheight, owidth)).__repr__() + + def test_random_resized_crop_video(self): + numFrames = random.randint(4, 128) + height = random.randint(10, 32) * 2 + width = random.randint(10, 32) * 2 + oheight = random.randint(5, (height - 2) / 2) * 2 + owidth = random.randint(5, (width - 2) / 2) * 2 + clip = torch.ones((numFrames, height, width, 3), dtype=torch.uint8) + result = transforms.Compose([ + transforms.ToTensorVideo(), + transforms.RandomResizedCropVideo((oheight, owidth)), + ])(clip) + assert result.size(2) == oheight + assert result.size(3) == owidth + + transforms.RandomResizedCropVideo((oheight, owidth)).__repr__() + + def test_center_crop_video(self): + numFrames = random.randint(4, 128) + height = random.randint(10, 32) * 2 + width = random.randint(10, 32) * 2 + oheight = random.randint(5, (height - 2) / 2) * 2 + owidth = random.randint(5, (width - 2) / 2) * 2 + + clip = torch.ones([numFrames, height, width, 3], dtype=torch.uint8) + oh1 = (height - oheight) // 2 + ow1 = (width - owidth) // 2 + clipNarrow = clip[:, oh1:oh1 + oheight, ow1:ow1 + owidth, :] + 
clipNarrow.fill_(0) + result = transforms.Compose([ + transforms.ToTensorVideo(), + transforms.CenterCropVideo((oheight, owidth)), + ])(clip) + + msg = "height: " + str(height) + " width: " \ + + str(width) + " oheight: " + str(oheight) + " owidth: " + str(owidth) + self.assertEqual(result.sum().item(), 0, msg) + + oheight += 1 + owidth += 1 + result = transforms.Compose([ + transforms.ToTensorVideo(), + transforms.CenterCropVideo((oheight, owidth)), + ])(clip) + sum1 = result.sum() + + msg = "height: " + str(height) + " width: " \ + + str(width) + " oheight: " + str(oheight) + " owidth: " + str(owidth) + self.assertEqual(sum1.item() > 1, True, msg) + + oheight += 1 + owidth += 1 + result = transforms.Compose([ + transforms.ToTensorVideo(), + transforms.CenterCropVideo((oheight, owidth)), + ])(clip) + sum2 = result.sum() + + msg = "height: " + str(height) + " width: " \ + + str(width) + " oheight: " + str(oheight) + " owidth: " + str(owidth) + self.assertTrue(sum2.item() > 1, msg) + self.assertTrue(sum2.item() > sum1.item(), msg) + + + @unittest.skipIf(stats is None, 'scipy.stats is not available') + def test_normalize_video(self): + def samples_from_standard_normal(tensor): + p_value = stats.kstest(list(tensor.view(-1)), 'norm', args=(0, 1)).pvalue + return p_value > 0.0001 + + random_state = random.getstate() + random.seed(42) + for channels in [1, 3]: + numFrames = random.randint(4, 128) + height = random.randint(32, 256) + width = random.randint(32, 256) + mean = random.random() + std = random.random() + clip = torch.normal(mean, std, size=(channels, numFrames, height, width)) + mean = [clip[c].mean().item() for c in range(channels)] + std = [clip[c].std().item() for c in range(channels)] + normalized = transforms.NormalizeVideo(mean, std)(clip) + assert samples_from_standard_normal(normalized) + random.setstate(random_state) + + + # Checking the optional in-place behaviour + tensor = torch.rand((3, 128, 16, 16)) + tensor_inplace = transforms.NormalizeVideo((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)(tensor) + assert torch.equal(tensor, tensor_inplace) + + transforms.NormalizeVideo((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True).__repr__() + + def test_to_tensor_video(self): + test_channels = [1, 3, 4] + numFrames, height, width = 64, 4, 4 + trans = transforms.ToTensorVideo() + + with self.assertRaises(TypeError): + trans(np.random.rand(numFrames, height, width, 1).tolist()) + trans(torch.rand((numFrames, height, width, 1), dtype=torch.float)) + + with self.assertRaises(ValueError): + trans(torch.ones((3, numFrames, height, width, 3), dtype=torch.uint8)) + trans(torch.ones((height, width, 3), dtype=torch.uint8)) + trans(torch.ones((width, 3), dtype=torch.uint8)) + trans(torch.ones((3), dtype=torch.uint8)) + + trans.__repr__() + + @unittest.skipIf(stats is None, 'scipy.stats not available') + def test_random_horizontal_flip_video(self): + random_state = random.getstate() + random.seed(42) + clip = torch.rand((3, 4, 112, 112), dtype=torch.float) + hclip = clip.flip((-1)) + + num_samples = 250 + num_horizontal = 0 + for _ in range(num_samples): + out = transforms.RandomHorizontalFlipVideo()(clip) + if torch.all(torch.eq(out, hclip)): + num_horizontal += 1 + + p_value = stats.binom_test(num_horizontal, num_samples, p=0.5) + random.setstate(random_state) + assert p_value > 0.0001 + + num_samples = 250 + num_horizontal = 0 + for _ in range(num_samples): + out = transforms.RandomHorizontalFlipVideo(p=0.7)(clip) + if torch.all(torch.eq(out, hclip)): + num_horizontal += 1 + + p_value = 
stats.binom_test(num_horizontal, num_samples, p=0.7) + random.setstate(random_state) + assert p_value > 0.0001 + + transforms.RandomHorizontalFlipVideo().__repr__() + +if __name__ == '__main__': + unittest.main() diff --git a/torchvision/transforms/__init__.py b/torchvision/transforms/__init__.py index 7986cdd6429..175a8a8dc1b 100644 --- a/torchvision/transforms/__init__.py +++ b/torchvision/transforms/__init__.py @@ -1 +1,2 @@ from .transforms import * +from .transforms_video import * diff --git a/torchvision/transforms/functional_video.py b/torchvision/transforms/functional_video.py new file mode 100644 index 00000000000..0b4c84d5843 --- /dev/null +++ b/torchvision/transforms/functional_video.py @@ -0,0 +1,100 @@ +import torch + + +def _is_tensor_video_clip(clip): + if not torch.is_tensor(clip): + raise TypeError("clip should be Tesnor. Got %s" % type(clip)) + + if not clip.ndimension() == 4: + raise ValueError("clip should be 4D. Got %dD" % clip.dim()) + + return True + + +def crop(clip, i, j, h, w): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + """ + assert len(clip.size()) == 4, "clip should be a 4D tensor" + return clip[:, :, i : i + h, j : j + w] + + +def resize(clip, target_size, interpolation_mode): + assert len(target_size) == 2, "target size should be tuple (height, width)" + return torch.nn.functional.interpolate( + clip, size=target_size, mode=interpolation_mode + ) + + +def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): + """ + Do spatial cropping and resizing to the video clip + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + i (int): i in (i,j) i.e coordinates of the upper left corner. + j (int): j in (i,j) i.e coordinates of the upper left corner. + h (int): Height of the cropped region. + w (int): Width of the cropped region. + size (tuple(int, int)): height and width of resized clip + Returns: + clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + clip = crop(clip, i, j, h, w) + clip = resize(clip, size, interpolation_mode) + return clip + + +def center_crop(clip, crop_size): + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + h, w = clip.size(2), clip.size(3) + th, tw = crop_size + assert h >= th and w >= tw, "height and width must be no smaller than crop_size" + + i = int(round((h - th) / 2.0)) + j = int(round((w - tw) / 2.0)) + return crop(clip, i, j, th, tw) + + +def to_tensor(clip): + """ + Convert tensor data type to be float and permute the dimenions of clip tensor + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W) + """ + _is_tensor_video_clip(clip) + if not clip.dtype == torch.uint8: + raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype)) + return clip.float().permute(3, 0, 1, 2) + + +def normalize(clip, mean, std, inplace=False): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + mean (tuple): pixel RGB mean. Size is (3) + std (tuple): pixel standard deviation. 
Size is (3) + Returns: + normalized clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + if not inplace: + clip = clip.clone() + mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device) + std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device) + clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) + return clip + + +def hflip(clip): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + Returns: + flipped clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + return clip.flip((-1)) diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index b21a6d86eef..1d6171c1620 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -434,17 +434,17 @@ def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode self.padding_mode = padding_mode @staticmethod - def get_params(img, output_size): + def get_params(w, h, output_size): """Get parameters for ``crop`` for a random crop. Args: - img (PIL Image): Image to be cropped. + w: width of the image/video + h: height of the image/video output_size (tuple): Expected output size of the crop. Returns: tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. """ - w, h = img.size th, tw = output_size if w == tw and h == th: return 0, 0, h, w @@ -471,7 +471,7 @@ def __call__(self, img): if self.pad_if_needed and img.size[1] < self.size[0]: img = F.pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode) - i, j, h, w = self.get_params(img, self.size) + i, j, h, w = self.get_params(img.size[0], img.size[1], self.size) return F.crop(img, i, j, h, w) @@ -623,7 +623,7 @@ def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), interpolat self.ratio = ratio @staticmethod - def get_params(img, scale, ratio): + def get_params(height, width, scale, ratio): """Get parameters for ``crop`` for a random sized crop. Args: @@ -635,7 +635,7 @@ def get_params(img, scale, ratio): tuple: params (i, j, h, w) to be passed to ``crop`` for a random sized crop. """ - area = img.size[0] * img.size[1] + area = height * width for attempt in range(10): target_area = random.uniform(*scale) * area @@ -645,24 +645,24 @@ def get_params(img, scale, ratio): w = int(round(math.sqrt(target_area * aspect_ratio))) h = int(round(math.sqrt(target_area / aspect_ratio))) - if 0 < w <= img.size[0] and 0 < h <= img.size[1]: - i = random.randint(0, img.size[1] - h) - j = random.randint(0, img.size[0] - w) + if 0 < w <= width and 0 < h <= height: + i = random.randint(0, height - h) + j = random.randint(0, width - w) return i, j, h, w # Fallback to central crop - in_ratio = img.size[0] / img.size[1] + in_ratio = float(width) / float(height) if (in_ratio < min(ratio)): - w = img.size[0] + w = width h = int(round(w / min(ratio))) elif (in_ratio > max(ratio)): - h = img.size[1] + h = height w = int(round(h * max(ratio))) else: # whole image - w = img.size[0] - h = img.size[1] - i = (img.size[1] - h) // 2 - j = (img.size[0] - w) // 2 + w = width + h = height + i = (height - h) // 2 + j = (width - w) // 2 return i, j, h, w def __call__(self, img): @@ -673,7 +673,7 @@ def __call__(self, img): Returns: PIL Image: Randomly cropped and resized image. 
""" - i, j, h, w = self.get_params(img, self.scale, self.ratio) + i, j, h, w = self.get_params(img.size[1], img.size[0], self.scale, self.ratio) return F.resized_crop(img, i, j, h, w, self.size, self.interpolation) def __repr__(self): diff --git a/torchvision/transforms/transforms_video.py b/torchvision/transforms/transforms_video.py new file mode 100644 index 00000000000..7da6010f59f --- /dev/null +++ b/torchvision/transforms/transforms_video.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +import math +import numbers +import random + +from torchvision.transforms import ( + RandomCrop, + RandomResizedCrop, +) + +from . import functional_video as F + + +__all__ = [ + "RandomCropVideo", + "RandomResizedCropVideo", + "CenterCropVideo", + "NormalizeVideo", + "ToTensorVideo", + "RandomHorizontalFlipVideo", +] + + +class RandomCropVideo(RandomCrop): + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. + size is (C, T, OH, OW) + """ + i, j, h, w = self.get_params(clip.size(3), clip.size(2), self.size) + return F.crop(clip, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0})'.format(self.size) + + +class RandomResizedCropVideo(RandomResizedCrop): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation_mode="bilinear", + ): + if isinstance(size, tuple): + assert len(size) == 2, "size should be tuple (height, width)" + self.size = size + else: + self.size = (size, size) + + self.interpolation_mode = interpolation_mode + self.scale = scale + self.ratio = ratio + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. + size is (C, T, H, W) + """ + i, j, h, w = self.get_params(clip.size(2), clip.size(3), self.scale, self.ratio) + return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) + + def __repr__(self): + return self.__class__.__name__ + '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format( + self.size, self.interpolation_mode, self.scale, self.ratio) + + + +class CenterCropVideo(object): + def __init__(self, crop_size): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(size), int(size)) + else: + self.crop_size = crop_size + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is (C, T, crop_size, crop_size) + """ + return F.center_crop(clip, self.crop_size) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + +class NormalizeVideo(object): + """ + Normalize the video clip by mean subtraction and division by standard deviation + Args: + mean (3-tuple): pixel RGB mean + std (3-tuple): pixel RGB standard deviation + inplace (boolean): whether do in-place normalization + """ + + def __init__(self, mean, std, inplace=False): + self.mean = mean + self.std = std + self.inplace = inplace + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): video clip to be normalized. 
Size is (C, T, H, W) + """ + return F.normalize(clip, self.mean, self.std, self.inplace) + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1}, inplace={2})'.format( + self.mean, self.std, self.inplace) + + +class ToTensorVideo(object): + """ + Convert tensor data type to be float and permute the dimenions of clip tensor + """ + + def __init__(self): + pass + + def __call__(self, clip): + """ + Convert tensor data type to be float and permute the dimenions of clip tensor + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W) + """ + return F.to_tensor(clip) + + def __repr__(self): + return self.__class__.__name__ + +class RandomHorizontalFlipVideo(object): + """ + Flip the video clip along the horizonal direction with a given probability + Args: + p (float): probability of the clip being flipped. Default value is 0.5 + """ + + def __init__(self, p=0.5): + self.p = p + + def __call__(self, clip): + """ + Convert tensor data type to be float and permute the dimenions of clip tensor + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, H, W) + """ + if random.random() < self.p: + clip = F.hflip(clip) + return clip + + def __repr__(self): + return self.__class__.__name__ + "(p={0})".format(self.p) From 19f57059e462ff06509f0ccb9aee8103dbcdf9db Mon Sep 17 00:00:00 2001 From: zyan3 Date: Sat, 7 Sep 2019 22:09:24 -0700 Subject: [PATCH 2/5] [video transforms]in ToTensorVideo, divide value by 255.0 --- test/test_transforms_video.py | 6 +++--- torchvision/transforms/functional_video.py | 5 +++-- torchvision/transforms/transforms_video.py | 5 ++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_transforms_video.py b/test/test_transforms_video.py index 30370218ddb..5028ef675f3 100644 --- a/test/test_transforms_video.py +++ b/test/test_transforms_video.py @@ -19,7 +19,7 @@ def test_random_crop_video(self): width = random.randint(10, 32) * 2 oheight = random.randint(5, (height - 2) / 2) * 2 owidth = random.randint(5, (width - 2) / 2) * 2 - clip = torch.ones((numFrames, height, width, 3), dtype=torch.uint8) + clip = torch.randint(0, 256, (numFrames, height, width, 3), dtype=torch.uint8) result = transforms.Compose([ transforms.ToTensorVideo(), transforms.RandomCropVideo((oheight, owidth)), @@ -35,7 +35,7 @@ def test_random_resized_crop_video(self): width = random.randint(10, 32) * 2 oheight = random.randint(5, (height - 2) / 2) * 2 owidth = random.randint(5, (width - 2) / 2) * 2 - clip = torch.ones((numFrames, height, width, 3), dtype=torch.uint8) + clip = torch.randint(0, 256, (numFrames, height, width, 3), dtype=torch.uint8) result = transforms.Compose([ transforms.ToTensorVideo(), transforms.RandomResizedCropVideo((oheight, owidth)), @@ -52,7 +52,7 @@ def test_center_crop_video(self): oheight = random.randint(5, (height - 2) / 2) * 2 owidth = random.randint(5, (width - 2) / 2) * 2 - clip = torch.ones([numFrames, height, width, 3], dtype=torch.uint8) + clip = torch.ones((numFrames, height, width, 3), dtype=torch.uint8) * 255 oh1 = (height - oheight) // 2 ow1 = (width - owidth) // 2 clipNarrow = clip[:, oh1:oh1 + oheight, ow1:ow1 + owidth, :] diff --git a/torchvision/transforms/functional_video.py b/torchvision/transforms/functional_video.py index 0b4c84d5843..627c0e3b0f1 100644 --- a/torchvision/transforms/functional_video.py +++ b/torchvision/transforms/functional_video.py @@ -59,7 +59,8 @@ def center_crop(clip, 
crop_size): def to_tensor(clip): """ - Convert tensor data type to be float and permute the dimenions of clip tensor + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor Args: clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) Return: @@ -68,7 +69,7 @@ def to_tensor(clip): _is_tensor_video_clip(clip) if not clip.dtype == torch.uint8: raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype)) - return clip.float().permute(3, 0, 1, 2) + return clip.float().permute(3, 0, 1, 2) / 255.0 def normalize(clip, mean, std, inplace=False): diff --git a/torchvision/transforms/transforms_video.py b/torchvision/transforms/transforms_video.py index 7da6010f59f..50c292e30d1 100644 --- a/torchvision/transforms/transforms_video.py +++ b/torchvision/transforms/transforms_video.py @@ -126,7 +126,8 @@ def __repr__(self): class ToTensorVideo(object): """ - Convert tensor data type to be float and permute the dimenions of clip tensor + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor """ def __init__(self): @@ -134,7 +135,6 @@ def __init__(self): def __call__(self, clip): """ - Convert tensor data type to be float and permute the dimenions of clip tensor Args: clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) Return: @@ -157,7 +157,6 @@ def __init__(self, p=0.5): def __call__(self, clip): """ - Convert tensor data type to be float and permute the dimenions of clip tensor Args: clip (torch.tensor): Size is (C, T, H, W) Return: From 717aebaf8cc007105c9dbca70940460dc7023df7 Mon Sep 17 00:00:00 2001 From: zyan3 Date: Sat, 7 Sep 2019 22:39:31 -0700 Subject: [PATCH 3/5] [video transforms] fix a bug --- torchvision/transforms/transforms_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/transforms/transforms_video.py b/torchvision/transforms/transforms_video.py index 50c292e30d1..bab4811b938 100644 --- a/torchvision/transforms/transforms_video.py +++ b/torchvision/transforms/transforms_video.py @@ -82,7 +82,7 @@ def __repr__(self): class CenterCropVideo(object): def __init__(self, crop_size): if isinstance(crop_size, numbers.Number): - self.crop_size = (int(size), int(size)) + self.crop_size = (int(crop_size), int(crop_size)) else: self.crop_size = crop_size From 84cb0c9d6d9a32b8671e3548fbf07205714597de Mon Sep 17 00:00:00 2001 From: zyan3 Date: Thu, 19 Sep 2019 14:42:10 -0700 Subject: [PATCH 4/5] fix linting --- test/test_transforms_video.py | 4 +--- torchvision/transforms/functional_video.py | 2 +- torchvision/transforms/transforms_video.py | 13 ++++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/test/test_transforms_video.py b/test/test_transforms_video.py index 5028ef675f3..b0a237e9318 100644 --- a/test/test_transforms_video.py +++ b/test/test_transforms_video.py @@ -91,7 +91,6 @@ def test_center_crop_video(self): self.assertTrue(sum2.item() > 1, msg) self.assertTrue(sum2.item() > sum1.item(), msg) - @unittest.skipIf(stats is None, 'scipy.stats is not available') def test_normalize_video(self): def samples_from_standard_normal(tensor): @@ -113,7 +112,6 @@ def samples_from_standard_normal(tensor): assert samples_from_standard_normal(normalized) random.setstate(random_state) - # Checking the optional in-place behaviour tensor = torch.rand((3, 128, 16, 16)) tensor_inplace = transforms.NormalizeVideo((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)(tensor) @@ -122,7 +120,6 @@ def 
samples_from_standard_normal(tensor): transforms.NormalizeVideo((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True).__repr__() def test_to_tensor_video(self): - test_channels = [1, 3, 4] numFrames, height, width = 64, 4, 4 trans = transforms.ToTensorVideo() @@ -169,5 +166,6 @@ def test_random_horizontal_flip_video(self): transforms.RandomHorizontalFlipVideo().__repr__() + if __name__ == '__main__': unittest.main() diff --git a/torchvision/transforms/functional_video.py b/torchvision/transforms/functional_video.py index 627c0e3b0f1..ee7c07c8c6f 100644 --- a/torchvision/transforms/functional_video.py +++ b/torchvision/transforms/functional_video.py @@ -17,7 +17,7 @@ def crop(clip, i, j, h, w): clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) """ assert len(clip.size()) == 4, "clip should be a 4D tensor" - return clip[:, :, i : i + h, j : j + w] + return clip[:, :, i:i + h, j:j + w] def resize(clip, target_size, interpolation_mode): diff --git a/torchvision/transforms/transforms_video.py b/torchvision/transforms/transforms_video.py index bab4811b938..28e1c9a7e23 100644 --- a/torchvision/transforms/transforms_video.py +++ b/torchvision/transforms/transforms_video.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import math import numbers import random @@ -74,9 +73,10 @@ def __call__(self, clip): return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) def __repr__(self): - return self.__class__.__name__ + '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format( - self.size, self.interpolation_mode, self.scale, self.ratio) - + return self.__class__.__name__ + \ + '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format( + self.size, self.interpolation_mode, self.scale, self.ratio + ) class CenterCropVideo(object): @@ -91,13 +91,15 @@ def __call__(self, clip): Args: clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) Returns: - torch.tensor: central cropping of video clip. Size is (C, T, crop_size, crop_size) + torch.tensor: central cropping of video clip. 
Size is + (C, T, crop_size, crop_size) """ return F.center_crop(clip, self.crop_size) def __repr__(self): return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + class NormalizeVideo(object): """ Normalize the video clip by mean subtraction and division by standard deviation @@ -145,6 +147,7 @@ def __call__(self, clip): def __repr__(self): return self.__class__.__name__ + class RandomHorizontalFlipVideo(object): """ Flip the video clip along the horizonal direction with a given probability From 161848bd874380107d5085707aecb9116c7874a4 Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Tue, 24 Sep 2019 10:25:11 -0300 Subject: [PATCH 5/5] Make changes backwards-compatible --- test/test_transforms.py | 2 +- torchvision/transforms/functional_video.py | 4 ++-- torchvision/transforms/transforms.py | 22 ++++++++++++++++------ torchvision/transforms/transforms_video.py | 4 ++-- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/test/test_transforms.py b/test/test_transforms.py index e4c0759074c..7e8320d6d6c 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -148,7 +148,7 @@ def test_randomresized_params(self): aspect_min = max(round(random.random(), 2), epsilon) aspect_ratio_range = (aspect_min, aspect_min + round(random.random(), 2)) randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range) - i, j, h, w = randresizecrop.get_params(img.size[1], img.size[0], scale_range, aspect_ratio_range) + i, j, h, w = randresizecrop.get_params(img, scale_range, aspect_ratio_range) aspect_ratio_obtained = w / h assert (min(aspect_ratio_range) - epsilon <= aspect_ratio_obtained <= max(aspect_ratio_range) + epsilon or aspect_ratio_obtained == 1.0) diff --git a/torchvision/transforms/functional_video.py b/torchvision/transforms/functional_video.py index ee7c07c8c6f..06c30716908 100644 --- a/torchvision/transforms/functional_video.py +++ b/torchvision/transforms/functional_video.py @@ -17,7 +17,7 @@ def crop(clip, i, j, h, w): clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) """ assert len(clip.size()) == 4, "clip should be a 4D tensor" - return clip[:, :, i:i + h, j:j + w] + return clip[..., i:i + h, j:j + w] def resize(clip, target_size, interpolation_mode): @@ -48,7 +48,7 @@ def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): def center_crop(clip, crop_size): assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" - h, w = clip.size(2), clip.size(3) + h, w = clip.size(-2), clip.size(-1) th, tw = crop_size assert h >= th and w >= tw, "height and width must be no smaller than crop_size" diff --git a/torchvision/transforms/transforms.py b/torchvision/transforms/transforms.py index 1d6171c1620..3ec84aae84c 100644 --- a/torchvision/transforms/transforms.py +++ b/torchvision/transforms/transforms.py @@ -40,6 +40,15 @@ } +def _get_image_size(img): + if F._is_pil_image(img): + return img.size + elif isinstance(img, torch.Tensor) and img.dim() > 2: + return img.shape[-2:][::-1] + else: + raise TypeError("Unexpected type {}".format(type(img))) + + class Compose(object): """Composes several transforms together. @@ -434,17 +443,17 @@ def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode self.padding_mode = padding_mode @staticmethod - def get_params(w, h, output_size): + def get_params(img, output_size): """Get parameters for ``crop`` for a random crop. Args: - w: width of the image/video - h: height of the image/video + img (PIL Image): Image to be cropped. 
output_size (tuple): Expected output size of the crop. Returns: tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. """ + w, h = _get_image_size(img) th, tw = output_size if w == tw and h == th: return 0, 0, h, w @@ -471,7 +480,7 @@ def __call__(self, img): if self.pad_if_needed and img.size[1] < self.size[0]: img = F.pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode) - i, j, h, w = self.get_params(img.size[0], img.size[1], self.size) + i, j, h, w = self.get_params(img, self.size) return F.crop(img, i, j, h, w) @@ -623,7 +632,7 @@ def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), interpolat self.ratio = ratio @staticmethod - def get_params(height, width, scale, ratio): + def get_params(img, scale, ratio): """Get parameters for ``crop`` for a random sized crop. Args: @@ -635,6 +644,7 @@ def get_params(height, width, scale, ratio): tuple: params (i, j, h, w) to be passed to ``crop`` for a random sized crop. """ + width, height = _get_image_size(img) area = height * width for attempt in range(10): @@ -673,7 +683,7 @@ def __call__(self, img): Returns: PIL Image: Randomly cropped and resized image. """ - i, j, h, w = self.get_params(img.size[1], img.size[0], self.scale, self.ratio) + i, j, h, w = self.get_params(img, self.scale, self.ratio) return F.resized_crop(img, i, j, h, w, self.size, self.interpolation) def __repr__(self): diff --git a/torchvision/transforms/transforms_video.py b/torchvision/transforms/transforms_video.py index 28e1c9a7e23..e11d8489eb8 100644 --- a/torchvision/transforms/transforms_video.py +++ b/torchvision/transforms/transforms_video.py @@ -36,7 +36,7 @@ def __call__(self, clip): torch.tensor: randomly cropped/resized video clip. size is (C, T, OH, OW) """ - i, j, h, w = self.get_params(clip.size(3), clip.size(2), self.size) + i, j, h, w = self.get_params(clip, self.size) return F.crop(clip, i, j, h, w) def __repr__(self): @@ -69,7 +69,7 @@ def __call__(self, clip): torch.tensor: randomly cropped/resized video clip. size is (C, T, H, W) """ - i, j, h, w = self.get_params(clip.size(2), clip.size(3), self.scale, self.ratio) + i, j, h, w = self.get_params(clip, self.scale, self.ratio) return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) def __repr__(self):
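
For reference, a minimal usage sketch of the video transforms introduced by this series, following the pattern exercised in test_transforms_video.py. The clip shape and the mean/std values below are illustrative placeholders, not values taken from the patches:

    import torch
    import torchvision.transforms as transforms

    # Decoded video clips are expected as uint8 tensors of shape (T, H, W, C).
    clip = torch.randint(0, 256, (16, 128, 171, 3), dtype=torch.uint8)

    # Illustrative normalization statistics (placeholders, not from the patches).
    mean = (0.45, 0.45, 0.45)
    std = (0.225, 0.225, 0.225)

    transform = transforms.Compose([
        transforms.ToTensorVideo(),                     # uint8 (T, H, W, C) -> float (C, T, H, W) in [0, 1]
        transforms.RandomResizedCropVideo((112, 112)),  # spatial crop + resize applied to every frame
        transforms.RandomHorizontalFlipVideo(),         # flips the whole clip with probability 0.5
        transforms.NormalizeVideo(mean, std),           # per-channel mean/std normalization
    ])

    out = transform(clip)  # float tensor of shape (C, T, 112, 112)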