From 9dc43fb6328ef63d8cb0c4ff1efe74f26fd31f34 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Sun, 11 Oct 2020 21:27:40 +0000
Subject: [PATCH 1/3] [WIP] Update ref example video classification

---
 references/video_classification/README.md | 9 +++++++++
 references/video_classification/train.py  | 3 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 references/video_classification/README.md

diff --git a/references/video_classification/README.md b/references/video_classification/README.md
new file mode 100644
index 00000000000..8a1e4b7d8bd
--- /dev/null
+++ b/references/video_classification/README.md
@@ -0,0 +1,9 @@
+# Video Classification
+
+TODO: Add some info about the context, the dataset we use, etc.
+
+## Training
+
+```bash
+cmd to execute
+```
diff --git a/references/video_classification/train.py b/references/video_classification/train.py
index e71c03f174f..67d94d5ff66 100644
--- a/references/video_classification/train.py
+++ b/references/video_classification/train.py
@@ -7,12 +7,13 @@
 from torch import nn
 import torchvision
 import torchvision.datasets.video_utils
-from torchvision import transforms
+# from torchvision import transforms as T
 from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
 
 import utils
 
 from scheduler import WarmupMultiStepLR
+# from transforms import ToFloatTensorInZeroOne
 import transforms as T
 
 try:

From 731a075b352894e21fbee7762c078a2aed2c8674 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Thu, 29 Oct 2020 15:09:55 +0000
Subject: [PATCH 2/3] [WIP] Updated video classification ref example

---
 references/video_classification/README.md     |  26 +++-
 references/video_classification/train.py      |  25 ++--
 references/video_classification/transforms.py | 128 ++----------------
 3 files changed, 48 insertions(+), 131 deletions(-)

diff --git a/references/video_classification/README.md b/references/video_classification/README.md
index 8a1e4b7d8bd..e4016369e89 100644
--- a/references/video_classification/README.md
+++ b/references/video_classification/README.md
@@ -2,8 +2,32 @@
 
 TODO: Add some info about the context, the dataset we use, etc.
 
+## Data preparation
+
+If you have already downloaded the [Kinetics400 dataset](https://deepmind.com/research/open-source/kinetics),
+please proceed directly to the next section.
+
+To download the videos, you can use https://github.com/Showmax/kinetics-downloader
+
 ## Training
 
+We assume the training and validation AVI videos are stored at `/data/kinetics400/train` and
+`/data/kinetics400/val`.
+
+### Multiple GPUs
+
+Run the training on a single node with 8 GPUs:
 ```bash
-cmd to execute
+python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-path=/data/kinetics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --apex
 ```
+
+### Single GPU
+
+**Note:** Training on a single GPU can be extremely slow.
+
+```bash
+python train.py --data-path=/data/kinetics400 --train-dir=train --val-dir=val --batch-size=8 --cache-dataset
+```
+
+
diff --git a/references/video_classification/train.py b/references/video_classification/train.py
index 67d94d5ff66..6cef08188af 100644
--- a/references/video_classification/train.py
+++ b/references/video_classification/train.py
@@ -7,14 +7,13 @@
 from torch import nn
 import torchvision
 import torchvision.datasets.video_utils
-# from torchvision import transforms as T
+from torchvision import transforms as T
 from torchvision.datasets.samplers import DistributedSampler, UniformClipSampler, RandomClipSampler
 
 import utils
 
 from scheduler import WarmupMultiStepLR
-# from transforms import ToFloatTensorInZeroOne
-import transforms as T
+from transforms import bhwc_to_bchw, bchw_to_cbhw
 
 try:
     from apex import amp
@@ -120,11 +119,13 @@ def main(args):
     st = time.time()
     cache_path = _get_cache_path(traindir)
     transform_train = torchvision.transforms.Compose([
-        T.ToFloatTensorInZeroOne(),
+        bhwc_to_bchw,
+        T.ConvertImageDtype(torch.float32),
         T.Resize((128, 171)),
         T.RandomHorizontalFlip(),
         normalize,
-        T.RandomCrop((112, 112))
+        T.RandomCrop((112, 112)),
+        bchw_to_cbhw
     ])
 
     if args.cache_dataset and os.path.exists(cache_path):
@@ -140,7 +141,8 @@
             frames_per_clip=args.clip_len,
             step_between_clips=1,
             transform=transform_train,
-            frame_rate=15
+            frame_rate=15,
+            extensions=('avi', 'mp4')
         )
         if args.cache_dataset:
             print("Saving dataset_train to {}".format(cache_path))
@@ -153,10 +155,12 @@
 
     cache_path = _get_cache_path(valdir)
     transform_test = torchvision.transforms.Compose([
-        T.ToFloatTensorInZeroOne(),
+        bhwc_to_bchw,
+        T.ConvertImageDtype(torch.float32),
         T.Resize((128, 171)),
         normalize,
-        T.CenterCrop((112, 112))
+        T.CenterCrop((112, 112)),
+        bchw_to_cbhw
     ])
 
     if args.cache_dataset and os.path.exists(cache_path):
@@ -172,7 +176,8 @@
             frames_per_clip=args.clip_len,
             step_between_clips=1,
             transform=transform_test,
-            frame_rate=15
+            frame_rate=15,
+            extensions=('avi', 'mp4')
         )
         if args.cache_dataset:
             print("Saving dataset_test to {}".format(cache_path))
@@ -266,7 +271,7 @@
 
 def parse_args():
     import argparse
-    parser = argparse.ArgumentParser(description='PyTorch Classification Training')
+    parser = argparse.ArgumentParser(description='PyTorch Video Classification Training')
 
     parser.add_argument('--data-path', default='/datasets01_101/kinetics/070618/', help='dataset')
     parser.add_argument('--train-dir', default='train_avi-480p', help='name of train dir')
diff --git a/references/video_classification/transforms.py b/references/video_classification/transforms.py
index 9435450c4b3..09d0185d4e7 100644
--- a/references/video_classification/transforms.py
+++ b/references/video_classification/transforms.py
@@ -1,122 +1,10 @@
-import torch
-import random
+def bhwc_to_bchw(vid):
+    """Convert tensor from (B, H, W, C) to (B, C, H, W)
+    """
+    return vid.permute(0, 3, 1, 2)
 
 
-def crop(vid, i, j, h, w):
-    return vid[..., i:(i + h), j:(j + w)]
-
-
-def center_crop(vid, output_size):
-    h, w = vid.shape[-2:]
-    th, tw = output_size
-
-    i = int(round((h - th) / 2.))
-    j = int(round((w - tw) / 2.))
-    return crop(vid, i, j, th, tw)
-
-
-def hflip(vid):
-    return vid.flip(dims=(-1,))
-
-
-# NOTE: for those functions, which generally expect mini-batches, we keep them
-# as non-minibatch so that they are applied as if they were 4d (thus image).
-# this way, we only apply the transformation in the spatial domain
-def resize(vid, size, interpolation='bilinear'):
-    # NOTE: using bilinear interpolation because we don't work on minibatches
-    # at this level
-    scale = None
-    if isinstance(size, int):
-        scale = float(size) / min(vid.shape[-2:])
-        size = None
-    return torch.nn.functional.interpolate(
-        vid, size=size, scale_factor=scale, mode=interpolation, align_corners=False)
-
-
-def pad(vid, padding, fill=0, padding_mode="constant"):
-    # NOTE: don't want to pad on temporal dimension, so let as non-batch
-    # (4d) before padding. This works as expected
-    return torch.nn.functional.pad(vid, padding, value=fill, mode=padding_mode)
-
-
-def to_normalized_float_tensor(vid):
-    return vid.permute(3, 0, 1, 2).to(torch.float32) / 255
-
-
-def normalize(vid, mean, std):
-    shape = (-1,) + (1,) * (vid.dim() - 1)
-    mean = torch.as_tensor(mean).reshape(shape)
-    std = torch.as_tensor(std).reshape(shape)
-    return (vid - mean) / std
-
-
-# Class interface
-
-class RandomCrop(object):
-    def __init__(self, size):
-        self.size = size
-
-    @staticmethod
-    def get_params(vid, output_size):
-        """Get parameters for ``crop`` for a random crop.
-        """
-        h, w = vid.shape[-2:]
-        th, tw = output_size
-        if w == tw and h == th:
-            return 0, 0, h, w
-        i = random.randint(0, h - th)
-        j = random.randint(0, w - tw)
-        return i, j, th, tw
-
-    def __call__(self, vid):
-        i, j, h, w = self.get_params(vid, self.size)
-        return crop(vid, i, j, h, w)
-
-
-class CenterCrop(object):
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, vid):
-        return center_crop(vid, self.size)
-
-
-class Resize(object):
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, vid):
-        return resize(vid, self.size)
-
-
-class ToFloatTensorInZeroOne(object):
-    def __call__(self, vid):
-        return to_normalized_float_tensor(vid)
-
-
-class Normalize(object):
-    def __init__(self, mean, std):
-        self.mean = mean
-        self.std = std
-
-    def __call__(self, vid):
-        return normalize(vid, self.mean, self.std)
-
-
-class RandomHorizontalFlip(object):
-    def __init__(self, p=0.5):
-        self.p = p
-
-    def __call__(self, vid):
-        if random.random() < self.p:
-            return hflip(vid)
-        return vid
-
-
-class Pad(object):
-    def __init__(self, padding, fill=0):
-        self.padding = padding
-        self.fill = fill
-
-    def __call__(self, vid):
-        return pad(vid, self.padding, self.fill)
+
+def bchw_to_cbhw(vid):
+    """Convert tensor from (B, C, H, W) to (C, B, H, W)
+    """
+    return vid.permute(1, 0, 2, 3)

From 7f48de1adb1d4a4785d5c12ec891faac2a161a80 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Mon, 2 Nov 2020 09:37:14 +0000
Subject: [PATCH 3/3] Replaced memory format conversion functions with classes

---
 references/video_classification/README.md     |  2 ++
 references/video_classification/train.py      | 10 +++++-----
 references/video_classification/transforms.py | 16 ++++++++++++----
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/references/video_classification/README.md b/references/video_classification/README.md
index e4016369e89..525cfddd414 100644
--- a/references/video_classification/README.md
+++ b/references/video_classification/README.md
@@ -21,6 +21,8 @@ Run the training on a single node with 8 GPUs:
 python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py --data-path=/data/kinetics400 --train-dir=train --val-dir=val --batch-size=16 --cache-dataset --sync-bn --apex
 ```
 
+
+
 ### Single GPU
 
 **Note:** Training on a single GPU can be extremely slow.
diff --git a/references/video_classification/train.py b/references/video_classification/train.py index 6cef08188af..3b5d8d8d206 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -13,7 +13,7 @@ import utils from scheduler import WarmupMultiStepLR -from transforms import bhwc_to_bchw, bchw_to_cbhw +from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW try: from apex import amp @@ -119,13 +119,13 @@ def main(args): st = time.time() cache_path = _get_cache_path(traindir) transform_train = torchvision.transforms.Compose([ - bhwc_to_bchw, + ConvertBHWCtoBCHW(), T.ConvertImageDtype(torch.float32), T.Resize((128, 171)), T.RandomHorizontalFlip(), normalize, T.RandomCrop((112, 112)), - bchw_to_cbhw + ConvertBCHWtoCBHW() ]) if args.cache_dataset and os.path.exists(cache_path): @@ -155,12 +155,12 @@ def main(args): cache_path = _get_cache_path(valdir) transform_test = torchvision.transforms.Compose([ - bhwc_to_bchw, + ConvertBHWCtoBCHW(), T.ConvertImageDtype(torch.float32), T.Resize((128, 171)), normalize, T.CenterCrop((112, 112)), - bchw_to_cbhw + ConvertBCHWtoCBHW() ]) if args.cache_dataset and os.path.exists(cache_path): diff --git a/references/video_classification/transforms.py b/references/video_classification/transforms.py index 09d0185d4e7..27f6c75450a 100644 --- a/references/video_classification/transforms.py +++ b/references/video_classification/transforms.py @@ -1,10 +1,18 @@ -def bhwc_to_bchw(vid): +import torch +import torch.nn as nn + + +class ConvertBHWCtoBCHW(nn.Module): """Convert tensor from (B, H, W, C) to (B, C, H, W) """ - return vid.permute(0, 3, 1, 2) + def forward(self, vid: torch.Tensor) -> torch.Tensor: + return vid.permute(0, 3, 1, 2) -def bchw_to_cbhw(vid): + +class ConvertBCHWtoCBHW(nn.Module): """Convert tensor from (B, C, H, W) to (C, B, H, W) """ - return vid.permute(1, 0, 2, 3) + + def forward(self, vid: torch.Tensor) -> torch.Tensor: + return vid.permute(1, 0, 2, 3)
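
---

For reference, a minimal sketch of how the final `ConvertBHWCtoBCHW` / `ConvertBCHWtoCBHW` classes from PATCH 3 compose with the stock torchvision transforms, mirroring the eval pipeline built in the patched `train.py`. The dummy clip shape and the normalization statistics below are illustrative assumptions, not values shown in this patch series; for a video clip, the leading `B` dimension of the conversion classes is really the time (frame) dimension:

```python
import torch
import torchvision.transforms as T

from transforms import ConvertBHWCtoBCHW, ConvertBCHWtoCBHW

# Illustrative Kinetics-style statistics; train.py defines its own `normalize`.
normalize = T.Normalize(mean=[0.43216, 0.394666, 0.37645],
                        std=[0.22803, 0.22145, 0.216989])

transform_test = T.Compose([
    ConvertBHWCtoBCHW(),                 # (T, H, W, C) uint8 -> (T, C, H, W)
    T.ConvertImageDtype(torch.float32),  # uint8 in [0, 255] -> float32 in [0, 1]
    T.Resize((128, 171)),
    normalize,
    T.CenterCrop((112, 112)),
    ConvertBCHWtoCBHW(),                 # (T, C, H, W) -> (C, T, H, W), as 3D conv models expect
])

# Dummy 16-frame clip, as returned by the video dataset in (T, H, W, C) layout.
clip = torch.randint(0, 256, (16, 240, 320, 3), dtype=torch.uint8)
out = transform_test(clip)
print(out.shape)  # torch.Size([3, 16, 112, 112])
```

The point of PATCH 3 is visible here: because the conversions are now `nn.Module` subclasses rather than bare functions, they slot into `torchvision.transforms.Compose` like any other transform and, presumably, can also be chained in an `nn.Sequential` alongside the tensor-based built-in transforms.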