new video data augmentation transform (#53)

stephenyan1231 · facebook-github-bot · commit 79c4194ef164 · 2019-10-16T21:37:00.000-07:00
Summary: Pull Request resolved: #53 - We add video transforms in TorchVision: pytorch/vision#1306 - In ClassyVision, we add default transforms for the training / test stages. Alternatively, users can explicitly provide a transform config in the JSON config input. See an example in the unit test. - Video data transforms support the audio modality in the video dataset. - Compared with image transforms, which only return a torch.Tensor, video transforms return a dict where each key is a modality name (e.g. {"video", "audio"}) and each value is a torch.Tensor holding the data of that modality. Reviewed By: taylorgordon20 Differential Revision: D16999453 fbshipit-source-id: 112b66a3965cba4201bbb12c99f3fdd2f1fce86f
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -16,9 +16,9 @@ jobs:
       # Download and cache dependencies
       - restore_cache:
           keys:
-            - v2-dependencies-{{ checksum "requirements.txt" }}
+            - v3-dependencies-{{ checksum "requirements.txt" }}
             # fallback to using the latest cache if no exact match is found
-            - v2-dependencies-
+            - v3-dependencies-
 
       - run:
           name: install dependencies
@@ -31,7 +31,7 @@ jobs:
       - save_cache:
           paths:
             - ./venv
-          key: v2-dependencies-{{ checksum "requirements.txt" }}
+          key: v3-dependencies-{{ checksum "requirements.txt" }}
 
       - run:
           name: run tests
diff --git a/classy_vision/dataset/core/__init__.py b/classy_vision/dataset/core/__init__.py
@@ -8,10 +8,12 @@
 from .dataset import Dataset
 from .list_dataset import ListDataset
 from .random_image_datasets import RandomImageBinaryClassDataset, RandomImageDataset
+from .random_video_datasets import RandomVideoDataset
 from .resample_dataset import ResampleDataset
 from .shuffle_dataset import ShuffleDataset
 from .transform_dataset import TransformDataset
 from .wrap_dataset import WrapDataset
+from .wrap_torchvision_video_dataset import WrapTorchVisionVideoDataset
 
 
 # TODO: Fix this:
@@ -23,8 +25,10 @@
     "ListDataset",
     "RandomImageBinaryClassDataset",
     "RandomImageDataset",
+    "RandomVideoDataset",
     "ResampleDataset",
     "ShuffleDataset",
     "TransformDataset",
     "WrapDataset",
+    "WrapTorchVisionVideoDataset",
 ]
diff --git a/classy_vision/dataset/core/random_video_datasets.py b/classy_vision/dataset/core/random_video_datasets.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from ...generic.util import torch_seed
+from .dataset import Dataset
+
+
+class RandomVideoDataset(Dataset):
+    """
+    Synthetic dataset of random video / audio clips with random targets.
+
+    Each sample is a dict with an "input" entry, holding a uint8 "video"
+    tensor and a float "audio" tensor, and an integer "target" entry.
+    Sampling runs under torch_seed(seed + offset), presumably so that a given
+    index always yields the same sample (depends on torch_seed semantics).
+    """
+
+    def __init__(
+        self,
+        num_classes,
+        split,
+        num_samples,
+        frames_per_clip,
+        video_width,
+        video_height,
+        audio_samples,
+        clips_per_video,
+        seed=10,
+    ):
+        """
+        Args:
+            num_classes: targets are drawn from [0, num_classes)
+            split: "train" draws one target per clip; any other value makes
+                each run of clips_per_video consecutive clips share a target
+            num_samples: number of clips, i.e. the dataset length
+            frames_per_clip: number of frames in every video clip
+            video_width: width of every video frame
+            video_height: height of every video frame
+            audio_samples: number of audio samples in every clip
+            clips_per_video: number of consecutive clips forming one video;
+                only read when split is not "train"
+            seed: base value added to the per-sample seed offset
+        """
+        # run the base Dataset initializer before setting our own attributes
+        super().__init__()
+        self.num_classes = num_classes
+        self.split = split
+        self.num_samples = num_samples
+        # video config
+        self.video_channels = 3
+        self.frames_per_clip = frames_per_clip
+        self.video_width = video_width
+        self.video_height = video_height
+        self.clips_per_video = clips_per_video
+        # audio config
+        self.audio_samples = audio_samples
+        # misc config
+        self.seed = seed
+
+    def __getitem__(self, idx):
+        """
+        Returns the sample at position idx.
+
+        The video tensor has shape (frames_per_clip, video_height,
+        video_width, 3) and the audio tensor has shape (audio_samples, 1).
+
+        Raises:
+            IndexError: if idx >= len(self)
+        """
+        if idx >= self.num_samples:
+            # An IndexError past the end is what stops plain iteration through
+            # the __getitem__ protocol; without it such a loop never finishes.
+            # Negative indices are deliberately left as they were.
+            raise IndexError(
+                f"index {idx} is out of range for {self.num_samples} samples"
+            )
+
+        if self.split == "train":
+            # assume we only sample 1 clip from each training video
+            target_seed_offset = idx
+        else:
+            # for video model testing, clips from the same video share the same
+            # target label
+            target_seed_offset = idx // self.clips_per_video
+        with torch_seed(self.seed + target_seed_offset):
+            target = torch.randint(0, self.num_classes, (1,)).item()
+
+        with torch_seed(self.seed + idx):
+            return {
+                "input": {
+                    "video": torch.randint(
+                        0,
+                        256,
+                        (
+                            self.frames_per_clip,
+                            self.video_height,
+                            self.video_width,
+                            self.video_channels,
+                        ),
+                        dtype=torch.uint8,
+                    ),
+                    "audio": torch.rand((self.audio_samples, 1), dtype=torch.float),
+                },
+                "target": target,
+            }
+
+    def __len__(self):
+        """Returns num_samples."""
+        return self.num_samples
diff --git a/classy_vision/dataset/core/wrap_torchvision_video_dataset.py b/classy_vision/dataset/core/wrap_torchvision_video_dataset.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .dataset import Dataset
+
+
+class WrapTorchVisionVideoDataset(Dataset):
+    """
+    Adapter that exposes a TorchVision video dataset through our core dataset
+    interface. Every item of the wrapped dataset is unpacked as a
+    (video, audio, target) triple and repackaged as a dict sample.
+    """
+
+    def __init__(self, dataset):
+        """
+        Args:
+            dataset: the torch.utils.data.Dataset instance to wrap
+        """
+        # torch is imported here rather than at module import time
+        from torch.utils.data import Dataset as TorchDataset
+
+        assert isinstance(dataset, TorchDataset)
+        super().__init__()
+        self.dataset = dataset
+
+    def __getitem__(self, idx):
+        """Returns item idx of the wrapped dataset as an input / target dict."""
+        video, audio, target = self.dataset[idx]
+        modalities = {"video": video, "audio": audio}
+        return {"input": modalities, "target": target}
+
+    def __len__(self):
+        """Returns the number of items in the wrapped dataset."""
+        return len(self.dataset)
+
+    def get_classy_state(self):
+        """Returns a state dict holding only the type of this wrapper."""
+        # Pytorch datasets don't have state; the type is stored for debugging
+        # saved states
+        debug_info = {"dataset_type": type(self)}
+        return {"state": debug_info}
diff --git a/classy_vision/dataset/transforms/__init__.py b/classy_vision/dataset/transforms/__init__.py
@@ -9,6 +9,7 @@
 from typing import Any, Callable, Dict, List
 
 import torchvision.transforms as transforms
+import torchvision.transforms._transforms_video as transforms_video
 from classy_vision.generic.registry_utils import import_all_modules
 
 from .classy_transform import ClassyTransform
@@ -34,11 +35,14 @@ def build_transform(transform_config: Dict[str, Any]) -> Callable:
     if name in TRANSFORM_REGISTRY:
         return TRANSFORM_REGISTRY[name].from_config(transform_args)
     # the name should be available in torchvision.transforms
-    assert hasattr(transforms, name), (
+    assert hasattr(transforms, name) or hasattr(transforms_video, name), (
         f"{name} isn't a registered tranform"
         ", nor is it available in torchvision.transforms"
     )
-    return getattr(transforms, name)(**transform_args)
+    if hasattr(transforms, name):
+        return getattr(transforms, name)(**transform_args)
+    else:
+        return getattr(transforms_video, name)(**transform_args)
 
 
 def build_transforms(transforms_config: List[Dict[str, Any]]) -> Callable:
diff --git a/classy_vision/dataset/transforms/util_video.py b/classy_vision/dataset/transforms/util_video.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+import torchvision.transforms as transforms
+import torchvision.transforms._transforms_video as transforms_video
+
+from . import ClassyTransform, build_transforms, register_transform
+from .util import FieldTransform, ImagenetConstants
+
+
+class VideoConstants:
+    """Default constants for video transforms.
+
+    MEAN / STD are taken from image classification to enable parameter
+    inflation, where parameters of 2D conv in an image model are inflated into
+    3D conv in a video model."""
+
+    MEAN = ImagenetConstants.MEAN
+    STD = ImagenetConstants.STD
+    CROP_SIZE = 112
+
+
+@register_transform("video_default_augment")
+class VideoDefaultAugmentTransform(ClassyTransform):
+    """
+    Default video transform with augmentation. It chains ToTensorVideo,
+    RandomResizedCropVideo(crop_size), RandomHorizontalFlipVideo and
+    NormalizeVideo(mean, std), in that order.
+    """
+
+    def __init__(
+        self,
+        crop_size: int = VideoConstants.CROP_SIZE,
+        mean: List[float] = VideoConstants.MEAN,
+        std: List[float] = VideoConstants.STD,
+    ):
+        pipeline = [
+            transforms_video.ToTensorVideo(),
+            transforms_video.RandomResizedCropVideo(crop_size),
+            transforms_video.RandomHorizontalFlipVideo(),
+            transforms_video.NormalizeVideo(mean=mean, std=std),
+        ]
+        self._transform = transforms.Compose(pipeline)
+
+    def __call__(self, video):
+        """Runs the composed transform on video and returns the result."""
+        return self._transform(video)
+
+
+@register_transform("video_default_no_augment")
+class VideoDefaultNoAugmentTransform(ClassyTransform):
+    """
+    Default video transform without augmentation. It chains ToTensorVideo and
+    NormalizeVideo(mean, std), in that order.
+    """
+
+    def __init__(
+        self,
+        mean: List[float] = VideoConstants.MEAN,
+        std: List[float] = VideoConstants.STD,
+    ):
+        # At testing stage, central cropping is not used because we
+        # conduct fully convolutional-style testing
+        pipeline = [
+            transforms_video.ToTensorVideo(),
+            transforms_video.NormalizeVideo(mean=mean, std=std),
+        ]
+        self._transform = transforms.Compose(pipeline)
+
+    def __call__(self, video):
+        """Runs the composed transform on video and returns the result."""
+        return self._transform(video)
+
+
+@register_transform("dummy_audio_transform")
+class DummyAudioTransform(ClassyTransform):
+    """
+    A placeholder audio transform. The incoming audio data is discarded and an
+    empty tensor is returned instead. This helps when the audio is a raw
+    waveform whose number of samples varies between clips, which makes
+    minibatch assembling impossible.
+    """
+
+    def __init__(self):
+        """Takes no arguments and keeps no state."""
+
+    def __call__(self, _audio):
+        """Returns an empty float tensor of shape (0, 1), ignoring _audio."""
+        empty_shape = (0, 1)
+        return torch.zeros(empty_shape, dtype=torch.float)
+
+
+class ClassyVideoGenericTransform:
+    """
+    Applies one transform per modality ("video" / "audio") to a dict of video
+    data. Split-dependent defaults are used unless config replaces them.
+    """
+
+    def __init__(
+        self,
+        config: Optional[Dict[str, List[Dict[str, Any]]]] = None,
+        split: str = "train",
+    ):
+        """
+        Args:
+            config: optional mapping from modality name to a list of transform
+                configs handed to build_transforms; listed modalities replace
+                the defaults
+            split: "train" picks the augmenting default video transform, any
+                other value the non-augmenting one
+        """
+        if split == "train":
+            video_transform = VideoDefaultAugmentTransform()
+        else:
+            video_transform = VideoDefaultNoAugmentTransform()
+        self.transforms = {"video": video_transform, "audio": DummyAudioTransform()}
+
+        overrides = {} if config is None else config
+        for mode, modal_config in overrides.items():
+            assert mode in ("video", "audio"), "unknown video data modality %s" % mode
+            self.transforms[mode] = build_transforms(modal_config)
+
+    def __call__(self, video):
+        """
+        Transforms, in place, every modality of video that has an entry in
+        self.transforms; other entries are left untouched. Returns video.
+        """
+        assert isinstance(video, dict), "video data is expected be a dict"
+        for mode, modal_data in video.items():
+            if mode not in self.transforms:
+                continue
+            video[mode] = self.transforms[mode](modal_data)
+        return video
+
+
+def build_video_field_transform_default(
+    config: Optional[Dict[str, List[Dict[str, Any]]]],
+    split: str = "train",
+    key: str = "input",
+) -> Callable:
+    """
+    Builds a ClassyVideoGenericTransform from config and split, and returns a
+    FieldTransform which applies it on the specified key.
+    """
+    video_transform = ClassyVideoGenericTransform(config, split)
+    return FieldTransform(video_transform, key=key)
diff --git a/test/dataset_transforms_util_video_test.py b/test/dataset_transforms_util_video_test.py