From f356fd875d8d9b8c5bcc8356ed00cc0ca17a4f75 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 00:38:52 +0100 Subject: [PATCH 1/7] make docs without multiprocessing --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e..474d12569d0 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,7 +6,7 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. -SPHINXOPTS = -W -j auto $(EXAMPLES_PATTERN_OPTS) +SPHINXOPTS = -W $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision SOURCEDIR = source From b2f6fe1b6102322d794e47fc6923aaf009b10634 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 13:34:07 +0100 Subject: [PATCH 2/7] debug --- docs/Makefile | 3 +- .../{plot_video_api.py => _plot_video_api.py} | 78 +++++++++---------- torchvision/io/__init__.py | 9 --- 3 files changed, 40 insertions(+), 50 deletions(-) rename gallery/{plot_video_api.py => _plot_video_api.py} (85%) diff --git a/docs/Makefile b/docs/Makefile index 474d12569d0..f462ff22303 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,7 +6,7 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. -SPHINXOPTS = -W $(EXAMPLES_PATTERN_OPTS) +SPHINXOPTS = -W -j auto $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision SOURCEDIR = source @@ -33,6 +33,7 @@ clean: rm -rf $(SOURCEDIR)/auto_examples/ # sphinx-gallery rm -rf $(SOURCEDIR)/gen_modules/ # sphinx-gallery rm -rf $(SOURCEDIR)/generated/ # autosummary + rm -rf $(SOURCEDIR)/models/generated # autosummary .PHONY: help Makefile docset diff --git a/gallery/plot_video_api.py b/gallery/_plot_video_api.py similarity index 85% rename from gallery/plot_video_api.py rename to gallery/_plot_video_api.py index d83a508eabe..9b226d7c6a1 100644 --- a/gallery/plot_video_api.py +++ b/gallery/_plot_video_api.py @@ -35,9 +35,7 @@ # Download the sample video download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - ".", - "WUzgd7C1pWA.mp4" + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", ".", "WUzgd7C1pWA.mp4" ) video_path = "./WUzgd7C1pWA.mp4" @@ -75,12 +73,12 @@ frames = [] # we are going to save the frames here. ptss = [] # pts is a presentation timestamp in seconds (float) of each frame for frame in video: - frames.append(frame['data']) - ptss.append(frame['pts']) + frames.append(frame["data"]) + ptss.append(frame["pts"]) print("PTS for first five frames ", ptss[:5]) print("Total number of frames: ", len(frames)) -approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] +approx_nf = metadata["audio"]["duration"][0] * metadata["audio"]["framerate"][0] print("Approx total number of datapoints we can expect: ", approx_nf) print("Read data size: ", frames[0].size(0) * len(frames)) @@ -96,6 +94,7 @@ import itertools + video.set_current_stream("video") frames = [] # we are going to save the frames here. @@ -116,11 +115,11 @@ frames = [] # we are going to save the frames here. 
video = video.seek(2) -for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): - frames.append(frame['data']) +for frame in itertools.takewhile(lambda x: x["pts"] <= 5, video): + frames.append(frame["data"]) print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] +approx_nf = (5 - 2) * video.get_metadata()["video"]["fps"][0] print("We can expect approx: ", approx_nf) print("Tensor size: ", frames[0].size()) @@ -135,19 +134,16 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au if end is None: end = float("inf") if end < start: - raise ValueError( - "end time should be larger than start time, got " - f"start time={start} and end time={end}" - ) + raise ValueError("end time should be larger than start time, got " f"start time={start} and end time={end}") video_frames = torch.empty(0) video_pts = [] if read_video: video_object.set_current_stream("video") frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - video_pts.append(frame['pts']) + for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): + frames.append(frame["data"]) + video_pts.append(frame["pts"]) if len(frames) > 0: video_frames = torch.stack(frames, 0) @@ -156,9 +152,9 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au if read_audio: video_object.set_current_stream("audio") frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - audio_pts.append(frame['pts']) + for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): + frames.append(frame["data"]) + audio_pts.append(frame["pts"]) if len(frames) > 0: audio_frames = torch.cat(frames, 0) @@ -179,6 +175,7 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au #################################### # Make sample dataset import os + os.makedirs("./dataset", exist_ok=True) os.makedirs("./dataset/1", exist_ok=True) os.makedirs("./dataset/2", exist_ok=True) @@ -186,29 +183,31 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au #################################### # Download the videos from torchvision.datasets.utils import download_url + download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", "WUzgd7C1pWA.mp4" + "./dataset/1", + "WUzgd7C1pWA.mp4", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi" + "RATRACE_wave_f_nm_np1_fr_goo_37.avi", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", "./dataset/2", - "SOX5yA1l24A.mp4" + "SOX5yA1l24A.mp4", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", "./dataset/2", - "v_SoccerJuggling_g23_c01.avi" + "v_SoccerJuggling_g23_c01.avi", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", "./dataset/2", - "v_SoccerJuggling_g24_c01.avi" + "v_SoccerJuggling_g24_c01.avi", ) #################################### @@ -231,6 +230,7 @@ def get_samples(root, extensions=(".mp4", ".avi")): _, class_to_idx = _find_classes(root) return make_dataset(root, class_to_idx, 
extensions=extensions) + #################################### # We are going to define the dataset and some basic arguments. # We assume the structure of the FolderDataset, and add the following parameters: @@ -269,23 +269,19 @@ def __iter__(self): video_frames = [] # video frame buffer # Seek and return frames - max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) - start = random.uniform(0., max_seek) + max_seek = metadata["video"]["duration"][0] - (self.clip_len / metadata["video"]["fps"][0]) + start = random.uniform(0.0, max_seek) for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame['data'])) - current_pts = frame['pts'] + video_frames.append(self.frame_transform(frame["data"])) + current_pts = frame["pts"] # Stack it into a tensor video = torch.stack(video_frames, 0) if self.video_transform: video = self.video_transform(video) - output = { - 'path': path, - 'video': video, - 'target': target, - 'start': start, - 'end': current_pts} + output = {"path": path, "video": video, "target": target, "start": start, "end": current_pts} yield output + #################################### # Given a path of videos in a folder structure, i.e: # @@ -310,14 +306,15 @@ def __iter__(self): #################################### from torch.utils.data import DataLoader + loader = DataLoader(dataset, batch_size=12) -data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} +data = {"video": [], "start": [], "end": [], "tensorsize": []} for batch in loader: - for i in range(len(batch['path'])): - data['video'].append(batch['path'][i]) - data['start'].append(batch['start'][i].item()) - data['end'].append(batch['end'][i].item()) - data['tensorsize'].append(batch['video'][i].size()) + for i in range(len(batch["path"])): + data["video"].append(batch["path"][i]) + data["start"].append(batch["start"][i].item()) + data["end"].append(batch["end"][i].item()) + data["tensorsize"].append(batch["video"][i].size()) print(data) #################################### @@ -337,5 +334,6 @@ def __iter__(self): # Cleanup the video and dataset: import os import shutil + os.remove("./WUzgd7C1pWA.mp4") shutil.rmtree("./dataset") diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py index 0787b8230e0..55b804156d1 100644 --- a/torchvision/io/__init__.py +++ b/torchvision/io/__init__.py @@ -1,9 +1,3 @@ -from typing import Any, Dict, Iterator - -import torch - -from ..utils import _log_api_usage_once - from ._video_opt import ( _HAS_VIDEO_OPT, _probe_video_from_file, @@ -43,8 +37,6 @@ "_read_video_timestamps_from_memory", "_probe_video_from_memory", "_HAS_VIDEO_OPT", - "_read_video_clip_from_memory", - "_read_video_meta_data", "VideoMetaData", "Timebase", "ImageReadMode", @@ -58,6 +50,5 @@ "write_file", "write_jpeg", "write_png", - "Video", "VideoReader", ] From 09454f0d57e36e7059c0f4b4daf4c53a7303ccb3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 13:36:26 +0100 Subject: [PATCH 3/7] revert unrelated style changes --- gallery/_plot_video_api.py | 339 ------------------------------------- 1 file changed, 339 deletions(-) delete mode 100644 gallery/_plot_video_api.py diff --git a/gallery/_plot_video_api.py b/gallery/_plot_video_api.py deleted file mode 100644 index 9b226d7c6a1..00000000000 --- a/gallery/_plot_video_api.py +++ /dev/null @@ -1,339 +0,0 @@ -""" -======================= -Video API -======================= - -This example illustrates some of the APIs that torchvision offers for -videos, 
together with the examples on how to build datasets and more. -""" - -#################################### -# 1. Introduction: building a new video object and examining the properties -# ------------------------------------------------------------------------- -# First we select a video to test the object out. For the sake of argument -# we're using one from kinetics400 dataset. -# To create it, we need to define the path and the stream we want to use. - -###################################### -# Chosen video statistics: -# -# - WUzgd7C1pWA.mp4 -# - source: -# - kinetics-400 -# - video: -# - H-264 -# - MPEG-4 AVC (part 10) (avc1) -# - fps: 29.97 -# - audio: -# - MPEG AAC audio (mp4a) -# - sample rate: 48K Hz -# - -import torch -import torchvision -from torchvision.datasets.utils import download_url - -# Download the sample video -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", ".", "WUzgd7C1pWA.mp4" -) -video_path = "./WUzgd7C1pWA.mp4" - -###################################### -# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form -# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. -# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. -# Firstly, let's get the metadata for our particular video: - -stream = "video" -video = torchvision.io.VideoReader(video_path, stream) -video.get_metadata() - -###################################### -# Here we can see that video has two streams - a video and an audio stream. -# Currently available stream types include ['video', 'audio']. -# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id -# (which are determined by video encoding). -# In this way, if the video container contains multiple streams of the same type, -# users can access the one they want. -# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. - -###################################### -# Let's read all the frames from the video stream. By default, the return value of -# ``next(video_reader)`` is a dict containing the following fields. -# -# The return fields are: -# -# - ``data``: containing a torch.tensor -# - ``pts``: containing a float timestamp of this particular frame - -metadata = video.get_metadata() -video.set_current_stream("audio") - -frames = [] # we are going to save the frames here. -ptss = [] # pts is a presentation timestamp in seconds (float) of each frame -for frame in video: - frames.append(frame["data"]) - ptss.append(frame["pts"]) - -print("PTS for first five frames ", ptss[:5]) -print("Total number of frames: ", len(frames)) -approx_nf = metadata["audio"]["duration"][0] * metadata["audio"]["framerate"][0] -print("Approx total number of datapoints we can expect: ", approx_nf) -print("Read data size: ", frames[0].size(0) * len(frames)) - -###################################### -# But what if we only want to read certain time segment of the video? -# That can be done easily using the combination of our ``seek`` function, and the fact that each call -# to next returns the presentation timestamp of the returned frame in seconds. -# -# Given that our implementation relies on python iterators, -# we can leverage itertools to simplify the process and make it more pythonic. 
-# -# For example, if we wanted to read ten frames from second second: - - -import itertools - -video.set_current_stream("video") - -frames = [] # we are going to save the frames here. - -# We seek into a second second of the video and use islice to get 10 frames since -for frame, pts in itertools.islice(video.seek(2), 10): - frames.append(frame) - -print("Total number of frames: ", len(frames)) - -###################################### -# Or if we wanted to read from 2nd to 5th second, -# We seek into a second second of the video, -# then we utilize the itertools takewhile to get the -# correct number of frames: - -video.set_current_stream("video") -frames = [] # we are going to save the frames here. -video = video.seek(2) - -for frame in itertools.takewhile(lambda x: x["pts"] <= 5, video): - frames.append(frame["data"]) - -print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()["video"]["fps"][0] -print("We can expect approx: ", approx_nf) -print("Tensor size: ", frames[0].size()) - -#################################### -# 2. Building a sample read_video function -# ---------------------------------------------------------------------------------------- -# We can utilize the methods above to build the read video function that follows -# the same API to the existing ``read_video`` function. - - -def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): - if end is None: - end = float("inf") - if end < start: - raise ValueError("end time should be larger than start time, got " f"start time={start} and end time={end}") - - video_frames = torch.empty(0) - video_pts = [] - if read_video: - video_object.set_current_stream("video") - frames = [] - for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): - frames.append(frame["data"]) - video_pts.append(frame["pts"]) - if len(frames) > 0: - video_frames = torch.stack(frames, 0) - - audio_frames = torch.empty(0) - audio_pts = [] - if read_audio: - video_object.set_current_stream("audio") - frames = [] - for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): - frames.append(frame["data"]) - audio_pts.append(frame["pts"]) - if len(frames) > 0: - audio_frames = torch.cat(frames, 0) - - return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() - - -# Total number of frames should be 327 for video and 523264 datapoints for audio -vf, af, info, meta = example_read_video(video) -print(vf.size(), af.size()) - -#################################### -# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# ------------------------------------------------------------------------------------------------------- -# Cool, so now we can use the same principle to make the sample dataset. -# We suggest trying out iterable dataset for this purpose. -# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
- -#################################### -# Make sample dataset -import os - -os.makedirs("./dataset", exist_ok=True) -os.makedirs("./dataset/1", exist_ok=True) -os.makedirs("./dataset/2", exist_ok=True) - -#################################### -# Download the videos -from torchvision.datasets.utils import download_url - -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", - "WUzgd7C1pWA.mp4", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", - "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", - "./dataset/2", - "SOX5yA1l24A.mp4", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g23_c01.avi", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g24_c01.avi", -) - -#################################### -# Housekeeping and utilities -import os -import random - -from torchvision.datasets.folder import make_dataset -from torchvision import transforms as t - - -def _find_classes(dir): - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - classes.sort() - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def get_samples(root, extensions=(".mp4", ".avi")): - _, class_to_idx = _find_classes(root) - return make_dataset(root, class_to_idx, extensions=extensions) - - -#################################### -# We are going to define the dataset and some basic arguments. -# We assume the structure of the FolderDataset, and add the following parameters: -# -# - ``clip_len``: length of a clip in frames -# - ``frame_transform``: transform for every frame individually -# - ``video_transform``: transform on a video sequence -# -# .. note:: -# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# class allows us to naturally oversample clips or images from each video if needed. 
- - -class RandomDataset(torch.utils.data.IterableDataset): - def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): - super(RandomDataset).__init__() - - self.samples = get_samples(root) - - # Allow for temporal jittering - if epoch_size is None: - epoch_size = len(self.samples) - self.epoch_size = epoch_size - - self.clip_len = clip_len - self.frame_transform = frame_transform - self.video_transform = video_transform - - def __iter__(self): - for i in range(self.epoch_size): - # Get random sample - path, target = random.choice(self.samples) - # Get video object - vid = torchvision.io.VideoReader(path, "video") - metadata = vid.get_metadata() - video_frames = [] # video frame buffer - - # Seek and return frames - max_seek = metadata["video"]["duration"][0] - (self.clip_len / metadata["video"]["fps"][0]) - start = random.uniform(0.0, max_seek) - for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame["data"])) - current_pts = frame["pts"] - # Stack it into a tensor - video = torch.stack(video_frames, 0) - if self.video_transform: - video = self.video_transform(video) - output = {"path": path, "video": video, "target": target, "start": start, "end": current_pts} - yield output - - -#################################### -# Given a path of videos in a folder structure, i.e: -# -# - dataset -# - class 1 -# - file 0 -# - file 1 -# - ... -# - class 2 -# - file 0 -# - file 1 -# - ... -# - ... -# -# We can generate a dataloader and test the dataset. - - -transforms = [t.Resize((112, 112))] -frame_transform = t.Compose(transforms) - -dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) - -#################################### -from torch.utils.data import DataLoader - -loader = DataLoader(dataset, batch_size=12) -data = {"video": [], "start": [], "end": [], "tensorsize": []} -for batch in loader: - for i in range(len(batch["path"])): - data["video"].append(batch["path"][i]) - data["start"].append(batch["start"][i].item()) - data["end"].append(batch["end"][i].item()) - data["tensorsize"].append(batch["video"][i].size()) -print(data) - -#################################### -# 4. Data Visualization -# ---------------------------------- -# Example of visualized video - -import matplotlib.pyplot as plt - -plt.figure(figsize=(12, 12)) -for i in range(16): - plt.subplot(4, 4, i + 1) - plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) - plt.axis("off") - -#################################### -# Cleanup the video and dataset: -import os -import shutil - -os.remove("./WUzgd7C1pWA.mp4") -shutil.rmtree("./dataset") From 7573e84ab680c230efacc5a246d06e01e256164a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 13:37:16 +0100 Subject: [PATCH 4/7] readd video gallery --- gallery/_plot_video_api.py | 341 +++++++++++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 gallery/_plot_video_api.py diff --git a/gallery/_plot_video_api.py b/gallery/_plot_video_api.py new file mode 100644 index 00000000000..d83a508eabe --- /dev/null +++ b/gallery/_plot_video_api.py @@ -0,0 +1,341 @@ +""" +======================= +Video API +======================= + +This example illustrates some of the APIs that torchvision offers for +videos, together with the examples on how to build datasets and more. +""" + +#################################### +# 1. 
Introduction: building a new video object and examining the properties +# ------------------------------------------------------------------------- +# First we select a video to test the object out. For the sake of argument +# we're using one from kinetics400 dataset. +# To create it, we need to define the path and the stream we want to use. + +###################################### +# Chosen video statistics: +# +# - WUzgd7C1pWA.mp4 +# - source: +# - kinetics-400 +# - video: +# - H-264 +# - MPEG-4 AVC (part 10) (avc1) +# - fps: 29.97 +# - audio: +# - MPEG AAC audio (mp4a) +# - sample rate: 48K Hz +# + +import torch +import torchvision +from torchvision.datasets.utils import download_url + +# Download the sample video +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", + ".", + "WUzgd7C1pWA.mp4" +) +video_path = "./WUzgd7C1pWA.mp4" + +###################################### +# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form +# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. +# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. +# Firstly, let's get the metadata for our particular video: + +stream = "video" +video = torchvision.io.VideoReader(video_path, stream) +video.get_metadata() + +###################################### +# Here we can see that video has two streams - a video and an audio stream. +# Currently available stream types include ['video', 'audio']. +# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id +# (which are determined by video encoding). +# In this way, if the video container contains multiple streams of the same type, +# users can access the one they want. +# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. + +###################################### +# Let's read all the frames from the video stream. By default, the return value of +# ``next(video_reader)`` is a dict containing the following fields. +# +# The return fields are: +# +# - ``data``: containing a torch.tensor +# - ``pts``: containing a float timestamp of this particular frame + +metadata = video.get_metadata() +video.set_current_stream("audio") + +frames = [] # we are going to save the frames here. +ptss = [] # pts is a presentation timestamp in seconds (float) of each frame +for frame in video: + frames.append(frame['data']) + ptss.append(frame['pts']) + +print("PTS for first five frames ", ptss[:5]) +print("Total number of frames: ", len(frames)) +approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] +print("Approx total number of datapoints we can expect: ", approx_nf) +print("Read data size: ", frames[0].size(0) * len(frames)) + +###################################### +# But what if we only want to read certain time segment of the video? +# That can be done easily using the combination of our ``seek`` function, and the fact that each call +# to next returns the presentation timestamp of the returned frame in seconds. +# +# Given that our implementation relies on python iterators, +# we can leverage itertools to simplify the process and make it more pythonic. +# +# For example, if we wanted to read ten frames from second second: + + +import itertools +video.set_current_stream("video") + +frames = [] # we are going to save the frames here. 
+ +# We seek into a second second of the video and use islice to get 10 frames since +for frame, pts in itertools.islice(video.seek(2), 10): + frames.append(frame) + +print("Total number of frames: ", len(frames)) + +###################################### +# Or if we wanted to read from 2nd to 5th second, +# We seek into a second second of the video, +# then we utilize the itertools takewhile to get the +# correct number of frames: + +video.set_current_stream("video") +frames = [] # we are going to save the frames here. +video = video.seek(2) + +for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): + frames.append(frame['data']) + +print("Total number of frames: ", len(frames)) +approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] +print("We can expect approx: ", approx_nf) +print("Tensor size: ", frames[0].size()) + +#################################### +# 2. Building a sample read_video function +# ---------------------------------------------------------------------------------------- +# We can utilize the methods above to build the read video function that follows +# the same API to the existing ``read_video`` function. + + +def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): + if end is None: + end = float("inf") + if end < start: + raise ValueError( + "end time should be larger than start time, got " + f"start time={start} and end time={end}" + ) + + video_frames = torch.empty(0) + video_pts = [] + if read_video: + video_object.set_current_stream("video") + frames = [] + for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): + frames.append(frame['data']) + video_pts.append(frame['pts']) + if len(frames) > 0: + video_frames = torch.stack(frames, 0) + + audio_frames = torch.empty(0) + audio_pts = [] + if read_audio: + video_object.set_current_stream("audio") + frames = [] + for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): + frames.append(frame['data']) + audio_pts.append(frame['pts']) + if len(frames) > 0: + audio_frames = torch.cat(frames, 0) + + return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() + + +# Total number of frames should be 327 for video and 523264 datapoints for audio +vf, af, info, meta = example_read_video(video) +print(vf.size(), af.size()) + +#################################### +# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) +# ------------------------------------------------------------------------------------------------------- +# Cool, so now we can use the same principle to make the sample dataset. +# We suggest trying out iterable dataset for this purpose. +# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
+ +#################################### +# Make sample dataset +import os +os.makedirs("./dataset", exist_ok=True) +os.makedirs("./dataset/1", exist_ok=True) +os.makedirs("./dataset/2", exist_ok=True) + +#################################### +# Download the videos +from torchvision.datasets.utils import download_url +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", + "./dataset/1", "WUzgd7C1pWA.mp4" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", + "./dataset/1", + "RATRACE_wave_f_nm_np1_fr_goo_37.avi" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", + "./dataset/2", + "SOX5yA1l24A.mp4" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", + "./dataset/2", + "v_SoccerJuggling_g23_c01.avi" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", + "./dataset/2", + "v_SoccerJuggling_g24_c01.avi" +) + +#################################### +# Housekeeping and utilities +import os +import random + +from torchvision.datasets.folder import make_dataset +from torchvision import transforms as t + + +def _find_classes(dir): + classes = [d.name for d in os.scandir(dir) if d.is_dir()] + classes.sort() + class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} + return classes, class_to_idx + + +def get_samples(root, extensions=(".mp4", ".avi")): + _, class_to_idx = _find_classes(root) + return make_dataset(root, class_to_idx, extensions=extensions) + +#################################### +# We are going to define the dataset and some basic arguments. +# We assume the structure of the FolderDataset, and add the following parameters: +# +# - ``clip_len``: length of a clip in frames +# - ``frame_transform``: transform for every frame individually +# - ``video_transform``: transform on a video sequence +# +# .. note:: +# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` +# class allows us to naturally oversample clips or images from each video if needed. 
+ + +class RandomDataset(torch.utils.data.IterableDataset): + def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): + super(RandomDataset).__init__() + + self.samples = get_samples(root) + + # Allow for temporal jittering + if epoch_size is None: + epoch_size = len(self.samples) + self.epoch_size = epoch_size + + self.clip_len = clip_len + self.frame_transform = frame_transform + self.video_transform = video_transform + + def __iter__(self): + for i in range(self.epoch_size): + # Get random sample + path, target = random.choice(self.samples) + # Get video object + vid = torchvision.io.VideoReader(path, "video") + metadata = vid.get_metadata() + video_frames = [] # video frame buffer + + # Seek and return frames + max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) + start = random.uniform(0., max_seek) + for frame in itertools.islice(vid.seek(start), self.clip_len): + video_frames.append(self.frame_transform(frame['data'])) + current_pts = frame['pts'] + # Stack it into a tensor + video = torch.stack(video_frames, 0) + if self.video_transform: + video = self.video_transform(video) + output = { + 'path': path, + 'video': video, + 'target': target, + 'start': start, + 'end': current_pts} + yield output + +#################################### +# Given a path of videos in a folder structure, i.e: +# +# - dataset +# - class 1 +# - file 0 +# - file 1 +# - ... +# - class 2 +# - file 0 +# - file 1 +# - ... +# - ... +# +# We can generate a dataloader and test the dataset. + + +transforms = [t.Resize((112, 112))] +frame_transform = t.Compose(transforms) + +dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) + +#################################### +from torch.utils.data import DataLoader +loader = DataLoader(dataset, batch_size=12) +data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} +for batch in loader: + for i in range(len(batch['path'])): + data['video'].append(batch['path'][i]) + data['start'].append(batch['start'][i].item()) + data['end'].append(batch['end'][i].item()) + data['tensorsize'].append(batch['video'][i].size()) +print(data) + +#################################### +# 4. Data Visualization +# ---------------------------------- +# Example of visualized video + +import matplotlib.pyplot as plt + +plt.figure(figsize=(12, 12)) +for i in range(16): + plt.subplot(4, 4, i + 1) + plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) + plt.axis("off") + +#################################### +# Cleanup the video and dataset: +import os +import shutil +os.remove("./WUzgd7C1pWA.mp4") +shutil.rmtree("./dataset") From b33a68d623b9472a71d8184f357749aadf937888 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 15:27:32 +0100 Subject: [PATCH 5/7] provide minimal reproduction --- gallery/_plot_video_api.py | 341 ------------------------------------ gallery/plot_video_api.py | 348 +++++++++++++++++++++++++++++++++++++ 2 files changed, 348 insertions(+), 341 deletions(-) delete mode 100644 gallery/_plot_video_api.py create mode 100644 gallery/plot_video_api.py diff --git a/gallery/_plot_video_api.py b/gallery/_plot_video_api.py deleted file mode 100644 index d83a508eabe..00000000000 --- a/gallery/_plot_video_api.py +++ /dev/null @@ -1,341 +0,0 @@ -""" -======================= -Video API -======================= - -This example illustrates some of the APIs that torchvision offers for -videos, together with the examples on how to build datasets and more. 
-""" - -#################################### -# 1. Introduction: building a new video object and examining the properties -# ------------------------------------------------------------------------- -# First we select a video to test the object out. For the sake of argument -# we're using one from kinetics400 dataset. -# To create it, we need to define the path and the stream we want to use. - -###################################### -# Chosen video statistics: -# -# - WUzgd7C1pWA.mp4 -# - source: -# - kinetics-400 -# - video: -# - H-264 -# - MPEG-4 AVC (part 10) (avc1) -# - fps: 29.97 -# - audio: -# - MPEG AAC audio (mp4a) -# - sample rate: 48K Hz -# - -import torch -import torchvision -from torchvision.datasets.utils import download_url - -# Download the sample video -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - ".", - "WUzgd7C1pWA.mp4" -) -video_path = "./WUzgd7C1pWA.mp4" - -###################################### -# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form -# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. -# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. -# Firstly, let's get the metadata for our particular video: - -stream = "video" -video = torchvision.io.VideoReader(video_path, stream) -video.get_metadata() - -###################################### -# Here we can see that video has two streams - a video and an audio stream. -# Currently available stream types include ['video', 'audio']. -# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id -# (which are determined by video encoding). -# In this way, if the video container contains multiple streams of the same type, -# users can access the one they want. -# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. - -###################################### -# Let's read all the frames from the video stream. By default, the return value of -# ``next(video_reader)`` is a dict containing the following fields. -# -# The return fields are: -# -# - ``data``: containing a torch.tensor -# - ``pts``: containing a float timestamp of this particular frame - -metadata = video.get_metadata() -video.set_current_stream("audio") - -frames = [] # we are going to save the frames here. -ptss = [] # pts is a presentation timestamp in seconds (float) of each frame -for frame in video: - frames.append(frame['data']) - ptss.append(frame['pts']) - -print("PTS for first five frames ", ptss[:5]) -print("Total number of frames: ", len(frames)) -approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] -print("Approx total number of datapoints we can expect: ", approx_nf) -print("Read data size: ", frames[0].size(0) * len(frames)) - -###################################### -# But what if we only want to read certain time segment of the video? -# That can be done easily using the combination of our ``seek`` function, and the fact that each call -# to next returns the presentation timestamp of the returned frame in seconds. -# -# Given that our implementation relies on python iterators, -# we can leverage itertools to simplify the process and make it more pythonic. -# -# For example, if we wanted to read ten frames from second second: - - -import itertools -video.set_current_stream("video") - -frames = [] # we are going to save the frames here. 
- -# We seek into a second second of the video and use islice to get 10 frames since -for frame, pts in itertools.islice(video.seek(2), 10): - frames.append(frame) - -print("Total number of frames: ", len(frames)) - -###################################### -# Or if we wanted to read from 2nd to 5th second, -# We seek into a second second of the video, -# then we utilize the itertools takewhile to get the -# correct number of frames: - -video.set_current_stream("video") -frames = [] # we are going to save the frames here. -video = video.seek(2) - -for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): - frames.append(frame['data']) - -print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] -print("We can expect approx: ", approx_nf) -print("Tensor size: ", frames[0].size()) - -#################################### -# 2. Building a sample read_video function -# ---------------------------------------------------------------------------------------- -# We can utilize the methods above to build the read video function that follows -# the same API to the existing ``read_video`` function. - - -def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): - if end is None: - end = float("inf") - if end < start: - raise ValueError( - "end time should be larger than start time, got " - f"start time={start} and end time={end}" - ) - - video_frames = torch.empty(0) - video_pts = [] - if read_video: - video_object.set_current_stream("video") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - video_pts.append(frame['pts']) - if len(frames) > 0: - video_frames = torch.stack(frames, 0) - - audio_frames = torch.empty(0) - audio_pts = [] - if read_audio: - video_object.set_current_stream("audio") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - audio_pts.append(frame['pts']) - if len(frames) > 0: - audio_frames = torch.cat(frames, 0) - - return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() - - -# Total number of frames should be 327 for video and 523264 datapoints for audio -vf, af, info, meta = example_read_video(video) -print(vf.size(), af.size()) - -#################################### -# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# ------------------------------------------------------------------------------------------------------- -# Cool, so now we can use the same principle to make the sample dataset. -# We suggest trying out iterable dataset for this purpose. -# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
- -#################################### -# Make sample dataset -import os -os.makedirs("./dataset", exist_ok=True) -os.makedirs("./dataset/1", exist_ok=True) -os.makedirs("./dataset/2", exist_ok=True) - -#################################### -# Download the videos -from torchvision.datasets.utils import download_url -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", "WUzgd7C1pWA.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", - "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", - "./dataset/2", - "SOX5yA1l24A.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g23_c01.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g24_c01.avi" -) - -#################################### -# Housekeeping and utilities -import os -import random - -from torchvision.datasets.folder import make_dataset -from torchvision import transforms as t - - -def _find_classes(dir): - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - classes.sort() - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def get_samples(root, extensions=(".mp4", ".avi")): - _, class_to_idx = _find_classes(root) - return make_dataset(root, class_to_idx, extensions=extensions) - -#################################### -# We are going to define the dataset and some basic arguments. -# We assume the structure of the FolderDataset, and add the following parameters: -# -# - ``clip_len``: length of a clip in frames -# - ``frame_transform``: transform for every frame individually -# - ``video_transform``: transform on a video sequence -# -# .. note:: -# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# class allows us to naturally oversample clips or images from each video if needed. 
- - -class RandomDataset(torch.utils.data.IterableDataset): - def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): - super(RandomDataset).__init__() - - self.samples = get_samples(root) - - # Allow for temporal jittering - if epoch_size is None: - epoch_size = len(self.samples) - self.epoch_size = epoch_size - - self.clip_len = clip_len - self.frame_transform = frame_transform - self.video_transform = video_transform - - def __iter__(self): - for i in range(self.epoch_size): - # Get random sample - path, target = random.choice(self.samples) - # Get video object - vid = torchvision.io.VideoReader(path, "video") - metadata = vid.get_metadata() - video_frames = [] # video frame buffer - - # Seek and return frames - max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) - start = random.uniform(0., max_seek) - for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame['data'])) - current_pts = frame['pts'] - # Stack it into a tensor - video = torch.stack(video_frames, 0) - if self.video_transform: - video = self.video_transform(video) - output = { - 'path': path, - 'video': video, - 'target': target, - 'start': start, - 'end': current_pts} - yield output - -#################################### -# Given a path of videos in a folder structure, i.e: -# -# - dataset -# - class 1 -# - file 0 -# - file 1 -# - ... -# - class 2 -# - file 0 -# - file 1 -# - ... -# - ... -# -# We can generate a dataloader and test the dataset. - - -transforms = [t.Resize((112, 112))] -frame_transform = t.Compose(transforms) - -dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) - -#################################### -from torch.utils.data import DataLoader -loader = DataLoader(dataset, batch_size=12) -data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} -for batch in loader: - for i in range(len(batch['path'])): - data['video'].append(batch['path'][i]) - data['start'].append(batch['start'][i].item()) - data['end'].append(batch['end'][i].item()) - data['tensorsize'].append(batch['video'][i].size()) -print(data) - -#################################### -# 4. Data Visualization -# ---------------------------------- -# Example of visualized video - -import matplotlib.pyplot as plt - -plt.figure(figsize=(12, 12)) -for i in range(16): - plt.subplot(4, 4, i + 1) - plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) - plt.axis("off") - -#################################### -# Cleanup the video and dataset: -import os -import shutil -os.remove("./WUzgd7C1pWA.mp4") -shutil.rmtree("./dataset") diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py new file mode 100644 index 00000000000..c867a61702f --- /dev/null +++ b/gallery/plot_video_api.py @@ -0,0 +1,348 @@ +""" +======================= +Video API +======================= + +This example illustrates some of the APIs that torchvision offers for +videos, together with the examples on how to build datasets and more. +""" + +#################################### +# 1. Introduction: building a new video object and examining the properties +# ------------------------------------------------------------------------- +# First we select a video to test the object out. For the sake of argument +# we're using one from kinetics400 dataset. +# To create it, we need to define the path and the stream we want to use. 
+ +###################################### +# Chosen video statistics: +# +# - WUzgd7C1pWA.mp4 +# - source: +# - kinetics-400 +# - video: +# - H-264 +# - MPEG-4 AVC (part 10) (avc1) +# - fps: 29.97 +# - audio: +# - MPEG AAC audio (mp4a) +# - sample rate: 48K Hz +# + +import torch +import torchvision +from torchvision.datasets.utils import download_url + +# Download the sample video +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", + ".", + "WUzgd7C1pWA.mp4" +) +video_path = "./WUzgd7C1pWA.mp4" + +###################################### +# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form +# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. +# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. +# Firstly, let's get the metadata for our particular video: + +stream = "video" +video = torchvision.io.VideoReader(video_path, stream) +video.get_metadata() + +###################################### +# Here we can see that video has two streams - a video and an audio stream. +# Currently available stream types include ['video', 'audio']. +# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id +# (which are determined by video encoding). +# In this way, if the video container contains multiple streams of the same type, +# users can access the one they want. +# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. + +###################################### +# Let's read all the frames from the video stream. By default, the return value of +# ``next(video_reader)`` is a dict containing the following fields. +# +# The return fields are: +# +# - ``data``: containing a torch.tensor +# - ``pts``: containing a float timestamp of this particular frame + +metadata = video.get_metadata() +video.set_current_stream("audio") + +frames = [] # we are going to save the frames here. +ptss = [] # pts is a presentation timestamp in seconds (float) of each frame +for frame in video: + frames.append(frame['data']) + ptss.append(frame['pts']) + +print("PTS for first five frames ", ptss[:5]) +print("Total number of frames: ", len(frames)) +approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] +print("Approx total number of datapoints we can expect: ", approx_nf) +print("Read data size: ", frames[0].size(0) * len(frames)) + +###################################### +# But what if we only want to read certain time segment of the video? +# That can be done easily using the combination of our ``seek`` function, and the fact that each call +# to next returns the presentation timestamp of the returned frame in seconds. +# +# Given that our implementation relies on python iterators, +# we can leverage itertools to simplify the process and make it more pythonic. +# +# For example, if we wanted to read ten frames from second second: + +# FIXME: With https://github.com/pytorch/vision/pull/6598 this blocks leads to sphinx build hanging when using +# multiprocessing. +video.set_current_stream("video") + +for _ in video.seek(2): + pass + + +# import itertools +# video.set_current_stream("video") +# +# frames = [] # we are going to save the frames here. 
+# +# # We seek into a second second of the video and use islice to get 10 frames since +# for frame, pts in itertools.islice(video.seek(2), 10): +# frames.append(frame) +# +# print("Total number of frames: ", len(frames)) +# +# ###################################### +# # Or if we wanted to read from 2nd to 5th second, +# # We seek into a second second of the video, +# # then we utilize the itertools takewhile to get the +# # correct number of frames: +# +# video.set_current_stream("video") +# frames = [] # we are going to save the frames here. +# video = video.seek(2) +# +# for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): +# frames.append(frame['data']) +# +# print("Total number of frames: ", len(frames)) +# approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] +# print("We can expect approx: ", approx_nf) +# print("Tensor size: ", frames[0].size()) +# +# #################################### +# # 2. Building a sample read_video function +# # ---------------------------------------------------------------------------------------- +# # We can utilize the methods above to build the read video function that follows +# # the same API to the existing ``read_video`` function. +# +# +# def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): +# if end is None: +# end = float("inf") +# if end < start: +# raise ValueError( +# "end time should be larger than start time, got " +# f"start time={start} and end time={end}" +# ) +# +# video_frames = torch.empty(0) +# video_pts = [] +# if read_video: +# video_object.set_current_stream("video") +# frames = [] +# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): +# frames.append(frame['data']) +# video_pts.append(frame['pts']) +# if len(frames) > 0: +# video_frames = torch.stack(frames, 0) +# +# audio_frames = torch.empty(0) +# audio_pts = [] +# if read_audio: +# video_object.set_current_stream("audio") +# frames = [] +# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): +# frames.append(frame['data']) +# audio_pts.append(frame['pts']) +# if len(frames) > 0: +# audio_frames = torch.cat(frames, 0) +# +# return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() +# +# +# # Total number of frames should be 327 for video and 523264 datapoints for audio +# vf, af, info, meta = example_read_video(video) +# print(vf.size(), af.size()) +# +# #################################### +# # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) +# # ------------------------------------------------------------------------------------------------------- +# # Cool, so now we can use the same principle to make the sample dataset. +# # We suggest trying out iterable dataset for this purpose. +# # Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
+# +# #################################### +# # Make sample dataset +# import os +# os.makedirs("./dataset", exist_ok=True) +# os.makedirs("./dataset/1", exist_ok=True) +# os.makedirs("./dataset/2", exist_ok=True) +# +# #################################### +# # Download the videos +# from torchvision.datasets.utils import download_url +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", +# "./dataset/1", "WUzgd7C1pWA.mp4" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", +# "./dataset/1", +# "RATRACE_wave_f_nm_np1_fr_goo_37.avi" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", +# "./dataset/2", +# "SOX5yA1l24A.mp4" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", +# "./dataset/2", +# "v_SoccerJuggling_g23_c01.avi" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", +# "./dataset/2", +# "v_SoccerJuggling_g24_c01.avi" +# ) +# +# #################################### +# # Housekeeping and utilities +# import os +# import random +# +# from torchvision.datasets.folder import make_dataset +# from torchvision import transforms as t +# +# +# def _find_classes(dir): +# classes = [d.name for d in os.scandir(dir) if d.is_dir()] +# classes.sort() +# class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} +# return classes, class_to_idx +# +# +# def get_samples(root, extensions=(".mp4", ".avi")): +# _, class_to_idx = _find_classes(root) +# return make_dataset(root, class_to_idx, extensions=extensions) +# +# #################################### +# # We are going to define the dataset and some basic arguments. +# # We assume the structure of the FolderDataset, and add the following parameters: +# # +# # - ``clip_len``: length of a clip in frames +# # - ``frame_transform``: transform for every frame individually +# # - ``video_transform``: transform on a video sequence +# # +# # .. note:: +# # We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` +# # class allows us to naturally oversample clips or images from each video if needed. 
+# +# +# class RandomDataset(torch.utils.data.IterableDataset): +# def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): +# super(RandomDataset).__init__() +# +# self.samples = get_samples(root) +# +# # Allow for temporal jittering +# if epoch_size is None: +# epoch_size = len(self.samples) +# self.epoch_size = epoch_size +# +# self.clip_len = clip_len +# self.frame_transform = frame_transform +# self.video_transform = video_transform +# +# def __iter__(self): +# for i in range(self.epoch_size): +# # Get random sample +# path, target = random.choice(self.samples) +# # Get video object +# vid = torchvision.io.VideoReader(path, "video") +# metadata = vid.get_metadata() +# video_frames = [] # video frame buffer +# +# # Seek and return frames +# max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) +# start = random.uniform(0., max_seek) +# for frame in itertools.islice(vid.seek(start), self.clip_len): +# video_frames.append(self.frame_transform(frame['data'])) +# current_pts = frame['pts'] +# # Stack it into a tensor +# video = torch.stack(video_frames, 0) +# if self.video_transform: +# video = self.video_transform(video) +# output = { +# 'path': path, +# 'video': video, +# 'target': target, +# 'start': start, +# 'end': current_pts} +# yield output +# +# #################################### +# # Given a path of videos in a folder structure, i.e: +# # +# # - dataset +# # - class 1 +# # - file 0 +# # - file 1 +# # - ... +# # - class 2 +# # - file 0 +# # - file 1 +# # - ... +# # - ... +# # +# # We can generate a dataloader and test the dataset. +# +# +# transforms = [t.Resize((112, 112))] +# frame_transform = t.Compose(transforms) +# +# dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) +# +# #################################### +# from torch.utils.data import DataLoader +# loader = DataLoader(dataset, batch_size=12) +# data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} +# for batch in loader: +# for i in range(len(batch['path'])): +# data['video'].append(batch['path'][i]) +# data['start'].append(batch['start'][i].item()) +# data['end'].append(batch['end'][i].item()) +# data['tensorsize'].append(batch['video'][i].size()) +# print(data) +# +# #################################### +# # 4. Data Visualization +# # ---------------------------------- +# # Example of visualized video +# +# import matplotlib.pyplot as plt +# +# plt.figure(figsize=(12, 12)) +# for i in range(16): +# plt.subplot(4, 4, i + 1) +# plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) +# plt.axis("off") +# +# #################################### +# # Cleanup the video and dataset: +# import os +# import shutil +# os.remove("./WUzgd7C1pWA.mp4") +# shutil.rmtree("./dataset") From 027a90058d00942b9b18c59b5d0f96ac1d321e37 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 15:31:30 +0100 Subject: [PATCH 6/7] remove instead of comment --- gallery/plot_video_api.py | 246 -------------------------------------- 1 file changed, 246 deletions(-) diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py index c867a61702f..02d37eaafad 100644 --- a/gallery/plot_video_api.py +++ b/gallery/plot_video_api.py @@ -100,249 +100,3 @@ for _ in video.seek(2): pass - - -# import itertools -# video.set_current_stream("video") -# -# frames = [] # we are going to save the frames here. 
-# -# # We seek into a second second of the video and use islice to get 10 frames since -# for frame, pts in itertools.islice(video.seek(2), 10): -# frames.append(frame) -# -# print("Total number of frames: ", len(frames)) -# -# ###################################### -# # Or if we wanted to read from 2nd to 5th second, -# # We seek into a second second of the video, -# # then we utilize the itertools takewhile to get the -# # correct number of frames: -# -# video.set_current_stream("video") -# frames = [] # we are going to save the frames here. -# video = video.seek(2) -# -# for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): -# frames.append(frame['data']) -# -# print("Total number of frames: ", len(frames)) -# approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] -# print("We can expect approx: ", approx_nf) -# print("Tensor size: ", frames[0].size()) -# -# #################################### -# # 2. Building a sample read_video function -# # ---------------------------------------------------------------------------------------- -# # We can utilize the methods above to build the read video function that follows -# # the same API to the existing ``read_video`` function. -# -# -# def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): -# if end is None: -# end = float("inf") -# if end < start: -# raise ValueError( -# "end time should be larger than start time, got " -# f"start time={start} and end time={end}" -# ) -# -# video_frames = torch.empty(0) -# video_pts = [] -# if read_video: -# video_object.set_current_stream("video") -# frames = [] -# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): -# frames.append(frame['data']) -# video_pts.append(frame['pts']) -# if len(frames) > 0: -# video_frames = torch.stack(frames, 0) -# -# audio_frames = torch.empty(0) -# audio_pts = [] -# if read_audio: -# video_object.set_current_stream("audio") -# frames = [] -# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): -# frames.append(frame['data']) -# audio_pts.append(frame['pts']) -# if len(frames) > 0: -# audio_frames = torch.cat(frames, 0) -# -# return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() -# -# -# # Total number of frames should be 327 for video and 523264 datapoints for audio -# vf, af, info, meta = example_read_video(video) -# print(vf.size(), af.size()) -# -# #################################### -# # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# # ------------------------------------------------------------------------------------------------------- -# # Cool, so now we can use the same principle to make the sample dataset. -# # We suggest trying out iterable dataset for this purpose. -# # Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
-# -# #################################### -# # Make sample dataset -# import os -# os.makedirs("./dataset", exist_ok=True) -# os.makedirs("./dataset/1", exist_ok=True) -# os.makedirs("./dataset/2", exist_ok=True) -# -# #################################### -# # Download the videos -# from torchvision.datasets.utils import download_url -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", -# "./dataset/1", "WUzgd7C1pWA.mp4" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", -# "./dataset/1", -# "RATRACE_wave_f_nm_np1_fr_goo_37.avi" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", -# "./dataset/2", -# "SOX5yA1l24A.mp4" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", -# "./dataset/2", -# "v_SoccerJuggling_g23_c01.avi" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", -# "./dataset/2", -# "v_SoccerJuggling_g24_c01.avi" -# ) -# -# #################################### -# # Housekeeping and utilities -# import os -# import random -# -# from torchvision.datasets.folder import make_dataset -# from torchvision import transforms as t -# -# -# def _find_classes(dir): -# classes = [d.name for d in os.scandir(dir) if d.is_dir()] -# classes.sort() -# class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} -# return classes, class_to_idx -# -# -# def get_samples(root, extensions=(".mp4", ".avi")): -# _, class_to_idx = _find_classes(root) -# return make_dataset(root, class_to_idx, extensions=extensions) -# -# #################################### -# # We are going to define the dataset and some basic arguments. -# # We assume the structure of the FolderDataset, and add the following parameters: -# # -# # - ``clip_len``: length of a clip in frames -# # - ``frame_transform``: transform for every frame individually -# # - ``video_transform``: transform on a video sequence -# # -# # .. note:: -# # We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# # class allows us to naturally oversample clips or images from each video if needed. 
-# -# -# class RandomDataset(torch.utils.data.IterableDataset): -# def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): -# super(RandomDataset).__init__() -# -# self.samples = get_samples(root) -# -# # Allow for temporal jittering -# if epoch_size is None: -# epoch_size = len(self.samples) -# self.epoch_size = epoch_size -# -# self.clip_len = clip_len -# self.frame_transform = frame_transform -# self.video_transform = video_transform -# -# def __iter__(self): -# for i in range(self.epoch_size): -# # Get random sample -# path, target = random.choice(self.samples) -# # Get video object -# vid = torchvision.io.VideoReader(path, "video") -# metadata = vid.get_metadata() -# video_frames = [] # video frame buffer -# -# # Seek and return frames -# max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) -# start = random.uniform(0., max_seek) -# for frame in itertools.islice(vid.seek(start), self.clip_len): -# video_frames.append(self.frame_transform(frame['data'])) -# current_pts = frame['pts'] -# # Stack it into a tensor -# video = torch.stack(video_frames, 0) -# if self.video_transform: -# video = self.video_transform(video) -# output = { -# 'path': path, -# 'video': video, -# 'target': target, -# 'start': start, -# 'end': current_pts} -# yield output -# -# #################################### -# # Given a path of videos in a folder structure, i.e: -# # -# # - dataset -# # - class 1 -# # - file 0 -# # - file 1 -# # - ... -# # - class 2 -# # - file 0 -# # - file 1 -# # - ... -# # - ... -# # -# # We can generate a dataloader and test the dataset. -# -# -# transforms = [t.Resize((112, 112))] -# frame_transform = t.Compose(transforms) -# -# dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) -# -# #################################### -# from torch.utils.data import DataLoader -# loader = DataLoader(dataset, batch_size=12) -# data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} -# for batch in loader: -# for i in range(len(batch['path'])): -# data['video'].append(batch['path'][i]) -# data['start'].append(batch['start'][i].item()) -# data['end'].append(batch['end'][i].item()) -# data['tensorsize'].append(batch['video'][i].size()) -# print(data) -# -# #################################### -# # 4. Data Visualization -# # ---------------------------------- -# # Example of visualized video -# -# import matplotlib.pyplot as plt -# -# plt.figure(figsize=(12, 12)) -# for i in range(16): -# plt.subplot(4, 4, i + 1) -# plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) -# plt.axis("off") -# -# #################################### -# # Cleanup the video and dataset: -# import os -# import shutil -# os.remove("./WUzgd7C1pWA.mp4") -# shutil.rmtree("./dataset") From c85fef95c4f2b73ca2f9064f3b282046803c156b Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 3 Nov 2022 22:04:10 +0100 Subject: [PATCH 7/7] revert video api gallery --- gallery/plot_video_api.py | 247 +++++++++++++++++++++++++++++++++++++- 1 file changed, 243 insertions(+), 4 deletions(-) diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py index 02d37eaafad..d83a508eabe 100644 --- a/gallery/plot_video_api.py +++ b/gallery/plot_video_api.py @@ -94,9 +94,248 @@ # # For example, if we wanted to read ten frames from second second: -# FIXME: With https://github.com/pytorch/vision/pull/6598 this blocks leads to sphinx build hanging when using -# multiprocessing. 
+
+import itertools
+video.set_current_stream("video")
+
+frames = []  # we are going to save the frames here.
+
+# We seek to the second second of the video and use ``islice`` to get the next 10 frames:
+for frame in itertools.islice(video.seek(2), 10):
+    frames.append(frame['data'])
+
+print("Total number of frames: ", len(frames))
+
+######################################
+# Or, if we wanted to read from the 2nd to the 5th second,
+# we seek to the second second of the video,
+# then use ``itertools.takewhile`` to collect frames until
+# their presentation timestamp exceeds 5 seconds:
+
 video.set_current_stream("video")
+frames = []  # we are going to save the frames here.
+video = video.seek(2)
+
+for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video):
+    frames.append(frame['data'])
+
+print("Total number of frames: ", len(frames))
+approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
+print("We can expect approx: ", approx_nf)
+print("Tensor size: ", frames[0].size())
+
+####################################
+# 2. Building a sample read_video function
+# ----------------------------------------------------------------------------------------
+# We can use the methods above to build a read_video function that follows
+# the same API as the existing ``read_video`` function.
+
+
+def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True):
+    if end is None:
+        end = float("inf")
+    if end < start:
+        raise ValueError(
+            "end time should be larger than start time, got "
+            f"start time={start} and end time={end}"
+        )
+
+    video_frames = torch.empty(0)
+    video_pts = []
+    if read_video:
+        video_object.set_current_stream("video")
+        frames = []
+        for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
+            frames.append(frame['data'])
+            video_pts.append(frame['pts'])
+        if len(frames) > 0:
+            video_frames = torch.stack(frames, 0)
+
+    audio_frames = torch.empty(0)
+    audio_pts = []
+    if read_audio:
+        video_object.set_current_stream("audio")
+        frames = []
+        for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
+            frames.append(frame['data'])
+            audio_pts.append(frame['pts'])
+        if len(frames) > 0:
+            audio_frames = torch.cat(frames, 0)
+
+    return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata()
+
+
+# Total number of frames should be 327 for video and 523264 datapoints for audio
+vf, af, info, meta = example_read_video(video)
+print(vf.size(), af.size())
+
+####################################
+# 3. Building an example randomly sampled dataset (applicable to e.g. the Kinetics-400 training set)
+# -------------------------------------------------------------------------------------------------------
+# We can now use the same principle to build a sample dataset.
+# We suggest using an iterable dataset for this purpose.
+# Below, we build an example dataset that reads a randomly selected clip from each video.
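+####################################
+# Before assembling the full dataset, here is a minimal sketch of the sampling
+# pattern it will rely on (an illustration only, reusing the ``video`` reader
+# from above; the clip length of 16 frames is an arbitrary choice for this
+# sketch): seek to a random start point and read a fixed number of frames from
+# there.
+
+import random
+
+sketch_clip_len = 16
+video.set_current_stream("video")
+metadata = video.get_metadata()
+# Latest start time that still leaves room for a full clip
+max_seek = metadata["video"]['duration'][0] - (sketch_clip_len / metadata["video"]['fps'][0])
+start = random.uniform(0., max_seek)
+clip = [frame['data'] for frame in itertools.islice(video.seek(start), sketch_clip_len)]
+print("Sampled a clip of", len(clip), "frames starting at ~{:.2f} seconds".format(start))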
+
+####################################
+# Make sample dataset
+import os
+os.makedirs("./dataset", exist_ok=True)
+os.makedirs("./dataset/1", exist_ok=True)
+os.makedirs("./dataset/2", exist_ok=True)
+
+####################################
+# Download the videos
+from torchvision.datasets.utils import download_url
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
+    "./dataset/1", "WUzgd7C1pWA.mp4"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true",
+    "./dataset/1",
+    "RATRACE_wave_f_nm_np1_fr_goo_37.avi"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true",
+    "./dataset/2",
+    "SOX5yA1l24A.mp4"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true",
+    "./dataset/2",
+    "v_SoccerJuggling_g23_c01.avi"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true",
+    "./dataset/2",
+    "v_SoccerJuggling_g24_c01.avi"
+)
+
+####################################
+# Housekeeping and utilities
+import os
+import random
+
+from torchvision.datasets.folder import make_dataset
+from torchvision import transforms as t
+
+
+def _find_classes(dir):
+    classes = [d.name for d in os.scandir(dir) if d.is_dir()]
+    classes.sort()
+    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
+    return classes, class_to_idx
+
+
+def get_samples(root, extensions=(".mp4", ".avi")):
+    _, class_to_idx = _find_classes(root)
+    return make_dataset(root, class_to_idx, extensions=extensions)
+
+####################################
+# We are going to define the dataset and some basic arguments.
+# We assume a ``DatasetFolder``-like directory structure and add the following parameters:
+#
+# - ``clip_len``: length of a clip in frames
+# - ``frame_transform``: transform applied to every frame individually
+# - ``video_transform``: transform applied to the whole video sequence
+#
+# .. note::
+#   We also add an ``epoch_size`` parameter: because the dataset is a
+#   :class:`~torch.utils.data.IterableDataset`, this lets us naturally oversample
+#   clips or images from each video if needed.
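+####################################
+# As a quick sanity check (an illustrative aside, assuming the five downloads
+# above succeeded), ``get_samples`` returns a list of ``(path, class_index)``
+# tuples -- exactly what the dataset defined next will draw from:
+
+samples = get_samples("./dataset")
+print("Number of samples: ", len(samples))
+print("First sample: ", samples[0])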
+
+
+class RandomDataset(torch.utils.data.IterableDataset):
+    def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16):
+        super().__init__()
+
+        self.samples = get_samples(root)
-for _ in video.seek(2):
-    pass
+        # Allow for temporal jittering
+        if epoch_size is None:
+            epoch_size = len(self.samples)
+        self.epoch_size = epoch_size
+
+        self.clip_len = clip_len
+        self.frame_transform = frame_transform
+        self.video_transform = video_transform
+
+    def __iter__(self):
+        for i in range(self.epoch_size):
+            # Get random sample
+            path, target = random.choice(self.samples)
+            # Get video object
+            vid = torchvision.io.VideoReader(path, "video")
+            metadata = vid.get_metadata()
+            video_frames = []  # video frame buffer
+
+            # Seek and return frames
+            max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0])
+            start = random.uniform(0., max_seek)
+            for frame in itertools.islice(vid.seek(start), self.clip_len):
+                video_frames.append(self.frame_transform(frame['data']))
+                current_pts = frame['pts']
+            # Stack it into a tensor
+            video = torch.stack(video_frames, 0)
+            if self.video_transform:
+                video = self.video_transform(video)
+            output = {
+                'path': path,
+                'video': video,
+                'target': target,
+                'start': start,
+                'end': current_pts}
+            yield output
+
+####################################
+# Given a folder structure of videos, i.e.:
+#
+# - dataset
+#    - class 1
+#        - file 0
+#        - file 1
+#        - ...
+#    - class 2
+#        - file 0
+#        - file 1
+#        - ...
+#    - ...
+#
+# we can generate a dataloader and test the dataset.
+
+
+transforms = [t.Resize((112, 112))]
+frame_transform = t.Compose(transforms)
+
+dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
+
+####################################
+from torch.utils.data import DataLoader
+loader = DataLoader(dataset, batch_size=12)
+data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
+for batch in loader:
+    for i in range(len(batch['path'])):
+        data['video'].append(batch['path'][i])
+        data['start'].append(batch['start'][i].item())
+        data['end'].append(batch['end'][i].item())
+        data['tensorsize'].append(batch['video'][i].size())
+print(data)
+
+####################################
+# 4. Data Visualization
+# ----------------------------------
+# Example of a visualized video
+
+import matplotlib.pyplot as plt
+
+plt.figure(figsize=(12, 12))
+for i in range(16):
+    plt.subplot(4, 4, i + 1)
+    plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
+    plt.axis("off")
+
+####################################
+# Cleanup the video and dataset:
+import os
+import shutil
+os.remove("./WUzgd7C1pWA.mp4")
+shutil.rmtree("./dataset")
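+####################################
+# A closing note on ``epoch_size`` (an illustrative sketch only, not executed
+# here since the dataset folder was just removed): because ``RandomDataset``
+# draws one random clip per iteration, oversampling is simply a matter of
+# asking for more iterations per epoch than there are videos, e.g.:
+#
+# .. code-block:: python
+#
+#     oversampled = RandomDataset("./dataset", epoch_size=4 * 5, frame_transform=frame_transform)
+#     oversampled_loader = DataLoader(oversampled, batch_size=12)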