From f356fd875d8d9b8c5bcc8356ed00cc0ca17a4f75 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 00:38:52 +0100 Subject: [PATCH 1/7] make docs without multiprocessing --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e..474d12569d0 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,7 +6,7 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. -SPHINXOPTS = -W -j auto $(EXAMPLES_PATTERN_OPTS) +SPHINXOPTS = -W $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision SOURCEDIR = source From b2f6fe1b6102322d794e47fc6923aaf009b10634 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 13:34:07 +0100 Subject: [PATCH 2/7] debug --- docs/Makefile | 3 +- .../{plot_video_api.py => _plot_video_api.py} | 78 +++++++++---------- torchvision/io/__init__.py | 9 --- 3 files changed, 40 insertions(+), 50 deletions(-) rename gallery/{plot_video_api.py => _plot_video_api.py} (85%) diff --git a/docs/Makefile b/docs/Makefile index 474d12569d0..f462ff22303 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -6,7 +6,7 @@ ifneq ($(EXAMPLES_PATTERN),) endif # You can set these variables from the command line. -SPHINXOPTS = -W $(EXAMPLES_PATTERN_OPTS) +SPHINXOPTS = -W -j auto $(EXAMPLES_PATTERN_OPTS) SPHINXBUILD = sphinx-build SPHINXPROJ = torchvision SOURCEDIR = source @@ -33,6 +33,7 @@ clean: rm -rf $(SOURCEDIR)/auto_examples/ # sphinx-gallery rm -rf $(SOURCEDIR)/gen_modules/ # sphinx-gallery rm -rf $(SOURCEDIR)/generated/ # autosummary + rm -rf $(SOURCEDIR)/models/generated # autosummary .PHONY: help Makefile docset diff --git a/gallery/plot_video_api.py b/gallery/_plot_video_api.py similarity index 85% rename from gallery/plot_video_api.py rename to gallery/_plot_video_api.py index d83a508eabe..9b226d7c6a1 100644 --- a/gallery/plot_video_api.py +++ b/gallery/_plot_video_api.py @@ -35,9 +35,7 @@ # Download the sample video download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - ".", - "WUzgd7C1pWA.mp4" + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", ".", "WUzgd7C1pWA.mp4" ) video_path = "./WUzgd7C1pWA.mp4" @@ -75,12 +73,12 @@ frames = [] # we are going to save the frames here. ptss = [] # pts is a presentation timestamp in seconds (float) of each frame for frame in video: - frames.append(frame['data']) - ptss.append(frame['pts']) + frames.append(frame["data"]) + ptss.append(frame["pts"]) print("PTS for first five frames ", ptss[:5]) print("Total number of frames: ", len(frames)) -approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] +approx_nf = metadata["audio"]["duration"][0] * metadata["audio"]["framerate"][0] print("Approx total number of datapoints we can expect: ", approx_nf) print("Read data size: ", frames[0].size(0) * len(frames)) @@ -96,6 +94,7 @@ import itertools + video.set_current_stream("video") frames = [] # we are going to save the frames here. @@ -116,11 +115,11 @@ frames = [] # we are going to save the frames here. 
video = video.seek(2) -for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): - frames.append(frame['data']) +for frame in itertools.takewhile(lambda x: x["pts"] <= 5, video): + frames.append(frame["data"]) print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] +approx_nf = (5 - 2) * video.get_metadata()["video"]["fps"][0] print("We can expect approx: ", approx_nf) print("Tensor size: ", frames[0].size()) @@ -135,19 +134,16 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au if end is None: end = float("inf") if end < start: - raise ValueError( - "end time should be larger than start time, got " - f"start time={start} and end time={end}" - ) + raise ValueError("end time should be larger than start time, got " f"start time={start} and end time={end}") video_frames = torch.empty(0) video_pts = [] if read_video: video_object.set_current_stream("video") frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - video_pts.append(frame['pts']) + for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): + frames.append(frame["data"]) + video_pts.append(frame["pts"]) if len(frames) > 0: video_frames = torch.stack(frames, 0) @@ -156,9 +152,9 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au if read_audio: video_object.set_current_stream("audio") frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - audio_pts.append(frame['pts']) + for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): + frames.append(frame["data"]) + audio_pts.append(frame["pts"]) if len(frames) > 0: audio_frames = torch.cat(frames, 0) @@ -179,6 +175,7 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au #################################### # Make sample dataset import os + os.makedirs("./dataset", exist_ok=True) os.makedirs("./dataset/1", exist_ok=True) os.makedirs("./dataset/2", exist_ok=True) @@ -186,29 +183,31 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au #################################### # Download the videos from torchvision.datasets.utils import download_url + download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", "WUzgd7C1pWA.mp4" + "./dataset/1", + "WUzgd7C1pWA.mp4", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi" + "RATRACE_wave_f_nm_np1_fr_goo_37.avi", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", "./dataset/2", - "SOX5yA1l24A.mp4" + "SOX5yA1l24A.mp4", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", "./dataset/2", - "v_SoccerJuggling_g23_c01.avi" + "v_SoccerJuggling_g23_c01.avi", ) download_url( "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", "./dataset/2", - "v_SoccerJuggling_g24_c01.avi" + "v_SoccerJuggling_g24_c01.avi", ) #################################### @@ -231,6 +230,7 @@ def get_samples(root, extensions=(".mp4", ".avi")): _, class_to_idx = _find_classes(root) return make_dataset(root, class_to_idx, 
extensions=extensions) + #################################### # We are going to define the dataset and some basic arguments. # We assume the structure of the FolderDataset, and add the following parameters: @@ -269,23 +269,19 @@ def __iter__(self): video_frames = [] # video frame buffer # Seek and return frames - max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) - start = random.uniform(0., max_seek) + max_seek = metadata["video"]["duration"][0] - (self.clip_len / metadata["video"]["fps"][0]) + start = random.uniform(0.0, max_seek) for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame['data'])) - current_pts = frame['pts'] + video_frames.append(self.frame_transform(frame["data"])) + current_pts = frame["pts"] # Stack it into a tensor video = torch.stack(video_frames, 0) if self.video_transform: video = self.video_transform(video) - output = { - 'path': path, - 'video': video, - 'target': target, - 'start': start, - 'end': current_pts} + output = {"path": path, "video": video, "target": target, "start": start, "end": current_pts} yield output + #################################### # Given a path of videos in a folder structure, i.e: # @@ -310,14 +306,15 @@ def __iter__(self): #################################### from torch.utils.data import DataLoader + loader = DataLoader(dataset, batch_size=12) -data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} +data = {"video": [], "start": [], "end": [], "tensorsize": []} for batch in loader: - for i in range(len(batch['path'])): - data['video'].append(batch['path'][i]) - data['start'].append(batch['start'][i].item()) - data['end'].append(batch['end'][i].item()) - data['tensorsize'].append(batch['video'][i].size()) + for i in range(len(batch["path"])): + data["video"].append(batch["path"][i]) + data["start"].append(batch["start"][i].item()) + data["end"].append(batch["end"][i].item()) + data["tensorsize"].append(batch["video"][i].size()) print(data) #################################### @@ -337,5 +334,6 @@ def __iter__(self): # Cleanup the video and dataset: import os import shutil + os.remove("./WUzgd7C1pWA.mp4") shutil.rmtree("./dataset") diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py index 0787b8230e0..55b804156d1 100644 --- a/torchvision/io/__init__.py +++ b/torchvision/io/__init__.py @@ -1,9 +1,3 @@ -from typing import Any, Dict, Iterator - -import torch - -from ..utils import _log_api_usage_once - from ._video_opt import ( _HAS_VIDEO_OPT, _probe_video_from_file, @@ -43,8 +37,6 @@ "_read_video_timestamps_from_memory", "_probe_video_from_memory", "_HAS_VIDEO_OPT", - "_read_video_clip_from_memory", - "_read_video_meta_data", "VideoMetaData", "Timebase", "ImageReadMode", @@ -58,6 +50,5 @@ "write_file", "write_jpeg", "write_png", - "Video", "VideoReader", ] From 09454f0d57e36e7059c0f4b4daf4c53a7303ccb3 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 13:36:26 +0100 Subject: [PATCH 3/7] revert unrelated style changes --- gallery/_plot_video_api.py | 339 ------------------------------------- 1 file changed, 339 deletions(-) delete mode 100644 gallery/_plot_video_api.py diff --git a/gallery/_plot_video_api.py b/gallery/_plot_video_api.py deleted file mode 100644 index 9b226d7c6a1..00000000000 --- a/gallery/_plot_video_api.py +++ /dev/null @@ -1,339 +0,0 @@ -""" -======================= -Video API -======================= - -This example illustrates some of the APIs that torchvision offers for -videos, 
together with the examples on how to build datasets and more. -""" - -#################################### -# 1. Introduction: building a new video object and examining the properties -# ------------------------------------------------------------------------- -# First we select a video to test the object out. For the sake of argument -# we're using one from kinetics400 dataset. -# To create it, we need to define the path and the stream we want to use. - -###################################### -# Chosen video statistics: -# -# - WUzgd7C1pWA.mp4 -# - source: -# - kinetics-400 -# - video: -# - H-264 -# - MPEG-4 AVC (part 10) (avc1) -# - fps: 29.97 -# - audio: -# - MPEG AAC audio (mp4a) -# - sample rate: 48K Hz -# - -import torch -import torchvision -from torchvision.datasets.utils import download_url - -# Download the sample video -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", ".", "WUzgd7C1pWA.mp4" -) -video_path = "./WUzgd7C1pWA.mp4" - -###################################### -# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form -# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. -# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. -# Firstly, let's get the metadata for our particular video: - -stream = "video" -video = torchvision.io.VideoReader(video_path, stream) -video.get_metadata() - -###################################### -# Here we can see that video has two streams - a video and an audio stream. -# Currently available stream types include ['video', 'audio']. -# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id -# (which are determined by video encoding). -# In this way, if the video container contains multiple streams of the same type, -# users can access the one they want. -# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. - -###################################### -# Let's read all the frames from the video stream. By default, the return value of -# ``next(video_reader)`` is a dict containing the following fields. -# -# The return fields are: -# -# - ``data``: containing a torch.tensor -# - ``pts``: containing a float timestamp of this particular frame - -metadata = video.get_metadata() -video.set_current_stream("audio") - -frames = [] # we are going to save the frames here. -ptss = [] # pts is a presentation timestamp in seconds (float) of each frame -for frame in video: - frames.append(frame["data"]) - ptss.append(frame["pts"]) - -print("PTS for first five frames ", ptss[:5]) -print("Total number of frames: ", len(frames)) -approx_nf = metadata["audio"]["duration"][0] * metadata["audio"]["framerate"][0] -print("Approx total number of datapoints we can expect: ", approx_nf) -print("Read data size: ", frames[0].size(0) * len(frames)) - -###################################### -# But what if we only want to read certain time segment of the video? -# That can be done easily using the combination of our ``seek`` function, and the fact that each call -# to next returns the presentation timestamp of the returned frame in seconds. -# -# Given that our implementation relies on python iterators, -# we can leverage itertools to simplify the process and make it more pythonic. 
-# -# For example, if we wanted to read ten frames from second second: - - -import itertools - -video.set_current_stream("video") - -frames = [] # we are going to save the frames here. - -# We seek into a second second of the video and use islice to get 10 frames since -for frame, pts in itertools.islice(video.seek(2), 10): - frames.append(frame) - -print("Total number of frames: ", len(frames)) - -###################################### -# Or if we wanted to read from 2nd to 5th second, -# We seek into a second second of the video, -# then we utilize the itertools takewhile to get the -# correct number of frames: - -video.set_current_stream("video") -frames = [] # we are going to save the frames here. -video = video.seek(2) - -for frame in itertools.takewhile(lambda x: x["pts"] <= 5, video): - frames.append(frame["data"]) - -print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()["video"]["fps"][0] -print("We can expect approx: ", approx_nf) -print("Tensor size: ", frames[0].size()) - -#################################### -# 2. Building a sample read_video function -# ---------------------------------------------------------------------------------------- -# We can utilize the methods above to build the read video function that follows -# the same API to the existing ``read_video`` function. - - -def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): - if end is None: - end = float("inf") - if end < start: - raise ValueError("end time should be larger than start time, got " f"start time={start} and end time={end}") - - video_frames = torch.empty(0) - video_pts = [] - if read_video: - video_object.set_current_stream("video") - frames = [] - for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): - frames.append(frame["data"]) - video_pts.append(frame["pts"]) - if len(frames) > 0: - video_frames = torch.stack(frames, 0) - - audio_frames = torch.empty(0) - audio_pts = [] - if read_audio: - video_object.set_current_stream("audio") - frames = [] - for frame in itertools.takewhile(lambda x: x["pts"] <= end, video_object.seek(start)): - frames.append(frame["data"]) - audio_pts.append(frame["pts"]) - if len(frames) > 0: - audio_frames = torch.cat(frames, 0) - - return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() - - -# Total number of frames should be 327 for video and 523264 datapoints for audio -vf, af, info, meta = example_read_video(video) -print(vf.size(), af.size()) - -#################################### -# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# ------------------------------------------------------------------------------------------------------- -# Cool, so now we can use the same principle to make the sample dataset. -# We suggest trying out iterable dataset for this purpose. -# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
- -#################################### -# Make sample dataset -import os - -os.makedirs("./dataset", exist_ok=True) -os.makedirs("./dataset/1", exist_ok=True) -os.makedirs("./dataset/2", exist_ok=True) - -#################################### -# Download the videos -from torchvision.datasets.utils import download_url - -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", - "WUzgd7C1pWA.mp4", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", - "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", - "./dataset/2", - "SOX5yA1l24A.mp4", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g23_c01.avi", -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g24_c01.avi", -) - -#################################### -# Housekeeping and utilities -import os -import random - -from torchvision.datasets.folder import make_dataset -from torchvision import transforms as t - - -def _find_classes(dir): - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - classes.sort() - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def get_samples(root, extensions=(".mp4", ".avi")): - _, class_to_idx = _find_classes(root) - return make_dataset(root, class_to_idx, extensions=extensions) - - -#################################### -# We are going to define the dataset and some basic arguments. -# We assume the structure of the FolderDataset, and add the following parameters: -# -# - ``clip_len``: length of a clip in frames -# - ``frame_transform``: transform for every frame individually -# - ``video_transform``: transform on a video sequence -# -# .. note:: -# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# class allows us to naturally oversample clips or images from each video if needed. 
- - -class RandomDataset(torch.utils.data.IterableDataset): - def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): - super(RandomDataset).__init__() - - self.samples = get_samples(root) - - # Allow for temporal jittering - if epoch_size is None: - epoch_size = len(self.samples) - self.epoch_size = epoch_size - - self.clip_len = clip_len - self.frame_transform = frame_transform - self.video_transform = video_transform - - def __iter__(self): - for i in range(self.epoch_size): - # Get random sample - path, target = random.choice(self.samples) - # Get video object - vid = torchvision.io.VideoReader(path, "video") - metadata = vid.get_metadata() - video_frames = [] # video frame buffer - - # Seek and return frames - max_seek = metadata["video"]["duration"][0] - (self.clip_len / metadata["video"]["fps"][0]) - start = random.uniform(0.0, max_seek) - for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame["data"])) - current_pts = frame["pts"] - # Stack it into a tensor - video = torch.stack(video_frames, 0) - if self.video_transform: - video = self.video_transform(video) - output = {"path": path, "video": video, "target": target, "start": start, "end": current_pts} - yield output - - -#################################### -# Given a path of videos in a folder structure, i.e: -# -# - dataset -# - class 1 -# - file 0 -# - file 1 -# - ... -# - class 2 -# - file 0 -# - file 1 -# - ... -# - ... -# -# We can generate a dataloader and test the dataset. - - -transforms = [t.Resize((112, 112))] -frame_transform = t.Compose(transforms) - -dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) - -#################################### -from torch.utils.data import DataLoader - -loader = DataLoader(dataset, batch_size=12) -data = {"video": [], "start": [], "end": [], "tensorsize": []} -for batch in loader: - for i in range(len(batch["path"])): - data["video"].append(batch["path"][i]) - data["start"].append(batch["start"][i].item()) - data["end"].append(batch["end"][i].item()) - data["tensorsize"].append(batch["video"][i].size()) -print(data) - -#################################### -# 4. Data Visualization -# ---------------------------------- -# Example of visualized video - -import matplotlib.pyplot as plt - -plt.figure(figsize=(12, 12)) -for i in range(16): - plt.subplot(4, 4, i + 1) - plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) - plt.axis("off") - -#################################### -# Cleanup the video and dataset: -import os -import shutil - -os.remove("./WUzgd7C1pWA.mp4") -shutil.rmtree("./dataset") From 7573e84ab680c230efacc5a246d06e01e256164a Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 13:37:16 +0100 Subject: [PATCH 4/7] readd video gallery --- gallery/_plot_video_api.py | 341 +++++++++++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 gallery/_plot_video_api.py diff --git a/gallery/_plot_video_api.py b/gallery/_plot_video_api.py new file mode 100644 index 00000000000..d83a508eabe --- /dev/null +++ b/gallery/_plot_video_api.py @@ -0,0 +1,341 @@ +""" +======================= +Video API +======================= + +This example illustrates some of the APIs that torchvision offers for +videos, together with the examples on how to build datasets and more. +""" + +#################################### +# 1. 
Introduction: building a new video object and examining the properties +# ------------------------------------------------------------------------- +# First we select a video to test the object out. For the sake of argument +# we're using one from kinetics400 dataset. +# To create it, we need to define the path and the stream we want to use. + +###################################### +# Chosen video statistics: +# +# - WUzgd7C1pWA.mp4 +# - source: +# - kinetics-400 +# - video: +# - H-264 +# - MPEG-4 AVC (part 10) (avc1) +# - fps: 29.97 +# - audio: +# - MPEG AAC audio (mp4a) +# - sample rate: 48K Hz +# + +import torch +import torchvision +from torchvision.datasets.utils import download_url + +# Download the sample video +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", + ".", + "WUzgd7C1pWA.mp4" +) +video_path = "./WUzgd7C1pWA.mp4" + +###################################### +# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form +# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. +# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. +# Firstly, let's get the metadata for our particular video: + +stream = "video" +video = torchvision.io.VideoReader(video_path, stream) +video.get_metadata() + +###################################### +# Here we can see that video has two streams - a video and an audio stream. +# Currently available stream types include ['video', 'audio']. +# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id +# (which are determined by video encoding). +# In this way, if the video container contains multiple streams of the same type, +# users can access the one they want. +# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. + +###################################### +# Let's read all the frames from the video stream. By default, the return value of +# ``next(video_reader)`` is a dict containing the following fields. +# +# The return fields are: +# +# - ``data``: containing a torch.tensor +# - ``pts``: containing a float timestamp of this particular frame + +metadata = video.get_metadata() +video.set_current_stream("audio") + +frames = [] # we are going to save the frames here. +ptss = [] # pts is a presentation timestamp in seconds (float) of each frame +for frame in video: + frames.append(frame['data']) + ptss.append(frame['pts']) + +print("PTS for first five frames ", ptss[:5]) +print("Total number of frames: ", len(frames)) +approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] +print("Approx total number of datapoints we can expect: ", approx_nf) +print("Read data size: ", frames[0].size(0) * len(frames)) + +###################################### +# But what if we only want to read certain time segment of the video? +# That can be done easily using the combination of our ``seek`` function, and the fact that each call +# to next returns the presentation timestamp of the returned frame in seconds. +# +# Given that our implementation relies on python iterators, +# we can leverage itertools to simplify the process and make it more pythonic. +# +# For example, if we wanted to read ten frames from second second: + + +import itertools +video.set_current_stream("video") + +frames = [] # we are going to save the frames here. 
+ +# We seek into a second second of the video and use islice to get 10 frames since +for frame, pts in itertools.islice(video.seek(2), 10): + frames.append(frame) + +print("Total number of frames: ", len(frames)) + +###################################### +# Or if we wanted to read from 2nd to 5th second, +# We seek into a second second of the video, +# then we utilize the itertools takewhile to get the +# correct number of frames: + +video.set_current_stream("video") +frames = [] # we are going to save the frames here. +video = video.seek(2) + +for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): + frames.append(frame['data']) + +print("Total number of frames: ", len(frames)) +approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] +print("We can expect approx: ", approx_nf) +print("Tensor size: ", frames[0].size()) + +#################################### +# 2. Building a sample read_video function +# ---------------------------------------------------------------------------------------- +# We can utilize the methods above to build the read video function that follows +# the same API to the existing ``read_video`` function. + + +def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): + if end is None: + end = float("inf") + if end < start: + raise ValueError( + "end time should be larger than start time, got " + f"start time={start} and end time={end}" + ) + + video_frames = torch.empty(0) + video_pts = [] + if read_video: + video_object.set_current_stream("video") + frames = [] + for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): + frames.append(frame['data']) + video_pts.append(frame['pts']) + if len(frames) > 0: + video_frames = torch.stack(frames, 0) + + audio_frames = torch.empty(0) + audio_pts = [] + if read_audio: + video_object.set_current_stream("audio") + frames = [] + for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): + frames.append(frame['data']) + audio_pts.append(frame['pts']) + if len(frames) > 0: + audio_frames = torch.cat(frames, 0) + + return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() + + +# Total number of frames should be 327 for video and 523264 datapoints for audio +vf, af, info, meta = example_read_video(video) +print(vf.size(), af.size()) + +#################################### +# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) +# ------------------------------------------------------------------------------------------------------- +# Cool, so now we can use the same principle to make the sample dataset. +# We suggest trying out iterable dataset for this purpose. +# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
+ +#################################### +# Make sample dataset +import os +os.makedirs("./dataset", exist_ok=True) +os.makedirs("./dataset/1", exist_ok=True) +os.makedirs("./dataset/2", exist_ok=True) + +#################################### +# Download the videos +from torchvision.datasets.utils import download_url +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", + "./dataset/1", "WUzgd7C1pWA.mp4" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", + "./dataset/1", + "RATRACE_wave_f_nm_np1_fr_goo_37.avi" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", + "./dataset/2", + "SOX5yA1l24A.mp4" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", + "./dataset/2", + "v_SoccerJuggling_g23_c01.avi" +) +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", + "./dataset/2", + "v_SoccerJuggling_g24_c01.avi" +) + +#################################### +# Housekeeping and utilities +import os +import random + +from torchvision.datasets.folder import make_dataset +from torchvision import transforms as t + + +def _find_classes(dir): + classes = [d.name for d in os.scandir(dir) if d.is_dir()] + classes.sort() + class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} + return classes, class_to_idx + + +def get_samples(root, extensions=(".mp4", ".avi")): + _, class_to_idx = _find_classes(root) + return make_dataset(root, class_to_idx, extensions=extensions) + +#################################### +# We are going to define the dataset and some basic arguments. +# We assume the structure of the FolderDataset, and add the following parameters: +# +# - ``clip_len``: length of a clip in frames +# - ``frame_transform``: transform for every frame individually +# - ``video_transform``: transform on a video sequence +# +# .. note:: +# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` +# class allows us to naturally oversample clips or images from each video if needed. 
+ + +class RandomDataset(torch.utils.data.IterableDataset): + def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): + super(RandomDataset).__init__() + + self.samples = get_samples(root) + + # Allow for temporal jittering + if epoch_size is None: + epoch_size = len(self.samples) + self.epoch_size = epoch_size + + self.clip_len = clip_len + self.frame_transform = frame_transform + self.video_transform = video_transform + + def __iter__(self): + for i in range(self.epoch_size): + # Get random sample + path, target = random.choice(self.samples) + # Get video object + vid = torchvision.io.VideoReader(path, "video") + metadata = vid.get_metadata() + video_frames = [] # video frame buffer + + # Seek and return frames + max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) + start = random.uniform(0., max_seek) + for frame in itertools.islice(vid.seek(start), self.clip_len): + video_frames.append(self.frame_transform(frame['data'])) + current_pts = frame['pts'] + # Stack it into a tensor + video = torch.stack(video_frames, 0) + if self.video_transform: + video = self.video_transform(video) + output = { + 'path': path, + 'video': video, + 'target': target, + 'start': start, + 'end': current_pts} + yield output + +#################################### +# Given a path of videos in a folder structure, i.e: +# +# - dataset +# - class 1 +# - file 0 +# - file 1 +# - ... +# - class 2 +# - file 0 +# - file 1 +# - ... +# - ... +# +# We can generate a dataloader and test the dataset. + + +transforms = [t.Resize((112, 112))] +frame_transform = t.Compose(transforms) + +dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) + +#################################### +from torch.utils.data import DataLoader +loader = DataLoader(dataset, batch_size=12) +data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} +for batch in loader: + for i in range(len(batch['path'])): + data['video'].append(batch['path'][i]) + data['start'].append(batch['start'][i].item()) + data['end'].append(batch['end'][i].item()) + data['tensorsize'].append(batch['video'][i].size()) +print(data) + +#################################### +# 4. Data Visualization +# ---------------------------------- +# Example of visualized video + +import matplotlib.pyplot as plt + +plt.figure(figsize=(12, 12)) +for i in range(16): + plt.subplot(4, 4, i + 1) + plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) + plt.axis("off") + +#################################### +# Cleanup the video and dataset: +import os +import shutil +os.remove("./WUzgd7C1pWA.mp4") +shutil.rmtree("./dataset") From b33a68d623b9472a71d8184f357749aadf937888 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 15:27:32 +0100 Subject: [PATCH 5/7] provide minimal reproduction --- gallery/_plot_video_api.py | 341 ------------------------------------ gallery/plot_video_api.py | 348 +++++++++++++++++++++++++++++++++++++ 2 files changed, 348 insertions(+), 341 deletions(-) delete mode 100644 gallery/_plot_video_api.py create mode 100644 gallery/plot_video_api.py diff --git a/gallery/_plot_video_api.py b/gallery/_plot_video_api.py deleted file mode 100644 index d83a508eabe..00000000000 --- a/gallery/_plot_video_api.py +++ /dev/null @@ -1,341 +0,0 @@ -""" -======================= -Video API -======================= - -This example illustrates some of the APIs that torchvision offers for -videos, together with the examples on how to build datasets and more. 
-""" - -#################################### -# 1. Introduction: building a new video object and examining the properties -# ------------------------------------------------------------------------- -# First we select a video to test the object out. For the sake of argument -# we're using one from kinetics400 dataset. -# To create it, we need to define the path and the stream we want to use. - -###################################### -# Chosen video statistics: -# -# - WUzgd7C1pWA.mp4 -# - source: -# - kinetics-400 -# - video: -# - H-264 -# - MPEG-4 AVC (part 10) (avc1) -# - fps: 29.97 -# - audio: -# - MPEG AAC audio (mp4a) -# - sample rate: 48K Hz -# - -import torch -import torchvision -from torchvision.datasets.utils import download_url - -# Download the sample video -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - ".", - "WUzgd7C1pWA.mp4" -) -video_path = "./WUzgd7C1pWA.mp4" - -###################################### -# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form -# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. -# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. -# Firstly, let's get the metadata for our particular video: - -stream = "video" -video = torchvision.io.VideoReader(video_path, stream) -video.get_metadata() - -###################################### -# Here we can see that video has two streams - a video and an audio stream. -# Currently available stream types include ['video', 'audio']. -# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id -# (which are determined by video encoding). -# In this way, if the video container contains multiple streams of the same type, -# users can access the one they want. -# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. - -###################################### -# Let's read all the frames from the video stream. By default, the return value of -# ``next(video_reader)`` is a dict containing the following fields. -# -# The return fields are: -# -# - ``data``: containing a torch.tensor -# - ``pts``: containing a float timestamp of this particular frame - -metadata = video.get_metadata() -video.set_current_stream("audio") - -frames = [] # we are going to save the frames here. -ptss = [] # pts is a presentation timestamp in seconds (float) of each frame -for frame in video: - frames.append(frame['data']) - ptss.append(frame['pts']) - -print("PTS for first five frames ", ptss[:5]) -print("Total number of frames: ", len(frames)) -approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] -print("Approx total number of datapoints we can expect: ", approx_nf) -print("Read data size: ", frames[0].size(0) * len(frames)) - -###################################### -# But what if we only want to read certain time segment of the video? -# That can be done easily using the combination of our ``seek`` function, and the fact that each call -# to next returns the presentation timestamp of the returned frame in seconds. -# -# Given that our implementation relies on python iterators, -# we can leverage itertools to simplify the process and make it more pythonic. -# -# For example, if we wanted to read ten frames from second second: - - -import itertools -video.set_current_stream("video") - -frames = [] # we are going to save the frames here. 
- -# We seek into a second second of the video and use islice to get 10 frames since -for frame, pts in itertools.islice(video.seek(2), 10): - frames.append(frame) - -print("Total number of frames: ", len(frames)) - -###################################### -# Or if we wanted to read from 2nd to 5th second, -# We seek into a second second of the video, -# then we utilize the itertools takewhile to get the -# correct number of frames: - -video.set_current_stream("video") -frames = [] # we are going to save the frames here. -video = video.seek(2) - -for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): - frames.append(frame['data']) - -print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] -print("We can expect approx: ", approx_nf) -print("Tensor size: ", frames[0].size()) - -#################################### -# 2. Building a sample read_video function -# ---------------------------------------------------------------------------------------- -# We can utilize the methods above to build the read video function that follows -# the same API to the existing ``read_video`` function. - - -def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): - if end is None: - end = float("inf") - if end < start: - raise ValueError( - "end time should be larger than start time, got " - f"start time={start} and end time={end}" - ) - - video_frames = torch.empty(0) - video_pts = [] - if read_video: - video_object.set_current_stream("video") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - video_pts.append(frame['pts']) - if len(frames) > 0: - video_frames = torch.stack(frames, 0) - - audio_frames = torch.empty(0) - audio_pts = [] - if read_audio: - video_object.set_current_stream("audio") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - audio_pts.append(frame['pts']) - if len(frames) > 0: - audio_frames = torch.cat(frames, 0) - - return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() - - -# Total number of frames should be 327 for video and 523264 datapoints for audio -vf, af, info, meta = example_read_video(video) -print(vf.size(), af.size()) - -#################################### -# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# ------------------------------------------------------------------------------------------------------- -# Cool, so now we can use the same principle to make the sample dataset. -# We suggest trying out iterable dataset for this purpose. -# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
- -#################################### -# Make sample dataset -import os -os.makedirs("./dataset", exist_ok=True) -os.makedirs("./dataset/1", exist_ok=True) -os.makedirs("./dataset/2", exist_ok=True) - -#################################### -# Download the videos -from torchvision.datasets.utils import download_url -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", "WUzgd7C1pWA.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", - "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", - "./dataset/2", - "SOX5yA1l24A.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g23_c01.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g24_c01.avi" -) - -#################################### -# Housekeeping and utilities -import os -import random - -from torchvision.datasets.folder import make_dataset -from torchvision import transforms as t - - -def _find_classes(dir): - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - classes.sort() - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def get_samples(root, extensions=(".mp4", ".avi")): - _, class_to_idx = _find_classes(root) - return make_dataset(root, class_to_idx, extensions=extensions) - -#################################### -# We are going to define the dataset and some basic arguments. -# We assume the structure of the FolderDataset, and add the following parameters: -# -# - ``clip_len``: length of a clip in frames -# - ``frame_transform``: transform for every frame individually -# - ``video_transform``: transform on a video sequence -# -# .. note:: -# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# class allows us to naturally oversample clips or images from each video if needed. 
- - -class RandomDataset(torch.utils.data.IterableDataset): - def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): - super(RandomDataset).__init__() - - self.samples = get_samples(root) - - # Allow for temporal jittering - if epoch_size is None: - epoch_size = len(self.samples) - self.epoch_size = epoch_size - - self.clip_len = clip_len - self.frame_transform = frame_transform - self.video_transform = video_transform - - def __iter__(self): - for i in range(self.epoch_size): - # Get random sample - path, target = random.choice(self.samples) - # Get video object - vid = torchvision.io.VideoReader(path, "video") - metadata = vid.get_metadata() - video_frames = [] # video frame buffer - - # Seek and return frames - max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) - start = random.uniform(0., max_seek) - for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame['data'])) - current_pts = frame['pts'] - # Stack it into a tensor - video = torch.stack(video_frames, 0) - if self.video_transform: - video = self.video_transform(video) - output = { - 'path': path, - 'video': video, - 'target': target, - 'start': start, - 'end': current_pts} - yield output - -#################################### -# Given a path of videos in a folder structure, i.e: -# -# - dataset -# - class 1 -# - file 0 -# - file 1 -# - ... -# - class 2 -# - file 0 -# - file 1 -# - ... -# - ... -# -# We can generate a dataloader and test the dataset. - - -transforms = [t.Resize((112, 112))] -frame_transform = t.Compose(transforms) - -dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) - -#################################### -from torch.utils.data import DataLoader -loader = DataLoader(dataset, batch_size=12) -data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} -for batch in loader: - for i in range(len(batch['path'])): - data['video'].append(batch['path'][i]) - data['start'].append(batch['start'][i].item()) - data['end'].append(batch['end'][i].item()) - data['tensorsize'].append(batch['video'][i].size()) -print(data) - -#################################### -# 4. Data Visualization -# ---------------------------------- -# Example of visualized video - -import matplotlib.pyplot as plt - -plt.figure(figsize=(12, 12)) -for i in range(16): - plt.subplot(4, 4, i + 1) - plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) - plt.axis("off") - -#################################### -# Cleanup the video and dataset: -import os -import shutil -os.remove("./WUzgd7C1pWA.mp4") -shutil.rmtree("./dataset") diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py new file mode 100644 index 00000000000..c867a61702f --- /dev/null +++ b/gallery/plot_video_api.py @@ -0,0 +1,348 @@ +""" +======================= +Video API +======================= + +This example illustrates some of the APIs that torchvision offers for +videos, together with the examples on how to build datasets and more. +""" + +#################################### +# 1. Introduction: building a new video object and examining the properties +# ------------------------------------------------------------------------- +# First we select a video to test the object out. For the sake of argument +# we're using one from kinetics400 dataset. +# To create it, we need to define the path and the stream we want to use. 
+ +###################################### +# Chosen video statistics: +# +# - WUzgd7C1pWA.mp4 +# - source: +# - kinetics-400 +# - video: +# - H-264 +# - MPEG-4 AVC (part 10) (avc1) +# - fps: 29.97 +# - audio: +# - MPEG AAC audio (mp4a) +# - sample rate: 48K Hz +# + +import torch +import torchvision +from torchvision.datasets.utils import download_url + +# Download the sample video +download_url( + "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", + ".", + "WUzgd7C1pWA.mp4" +) +video_path = "./WUzgd7C1pWA.mp4" + +###################################### +# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form +# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. +# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. +# Firstly, let's get the metadata for our particular video: + +stream = "video" +video = torchvision.io.VideoReader(video_path, stream) +video.get_metadata() + +###################################### +# Here we can see that video has two streams - a video and an audio stream. +# Currently available stream types include ['video', 'audio']. +# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id +# (which are determined by video encoding). +# In this way, if the video container contains multiple streams of the same type, +# users can access the one they want. +# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. + +###################################### +# Let's read all the frames from the video stream. By default, the return value of +# ``next(video_reader)`` is a dict containing the following fields. +# +# The return fields are: +# +# - ``data``: containing a torch.tensor +# - ``pts``: containing a float timestamp of this particular frame + +metadata = video.get_metadata() +video.set_current_stream("audio") + +frames = [] # we are going to save the frames here. +ptss = [] # pts is a presentation timestamp in seconds (float) of each frame +for frame in video: + frames.append(frame['data']) + ptss.append(frame['pts']) + +print("PTS for first five frames ", ptss[:5]) +print("Total number of frames: ", len(frames)) +approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] +print("Approx total number of datapoints we can expect: ", approx_nf) +print("Read data size: ", frames[0].size(0) * len(frames)) + +###################################### +# But what if we only want to read certain time segment of the video? +# That can be done easily using the combination of our ``seek`` function, and the fact that each call +# to next returns the presentation timestamp of the returned frame in seconds. +# +# Given that our implementation relies on python iterators, +# we can leverage itertools to simplify the process and make it more pythonic. +# +# For example, if we wanted to read ten frames from second second: + +# FIXME: With https://github.com/pytorch/vision/pull/6598 this blocks leads to sphinx build hanging when using +# multiprocessing. +video.set_current_stream("video") + +for _ in video.seek(2): + pass + + +# import itertools +# video.set_current_stream("video") +# +# frames = [] # we are going to save the frames here. 
+# +# # We seek into a second second of the video and use islice to get 10 frames since +# for frame, pts in itertools.islice(video.seek(2), 10): +# frames.append(frame) +# +# print("Total number of frames: ", len(frames)) +# +# ###################################### +# # Or if we wanted to read from 2nd to 5th second, +# # We seek into a second second of the video, +# # then we utilize the itertools takewhile to get the +# # correct number of frames: +# +# video.set_current_stream("video") +# frames = [] # we are going to save the frames here. +# video = video.seek(2) +# +# for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): +# frames.append(frame['data']) +# +# print("Total number of frames: ", len(frames)) +# approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] +# print("We can expect approx: ", approx_nf) +# print("Tensor size: ", frames[0].size()) +# +# #################################### +# # 2. Building a sample read_video function +# # ---------------------------------------------------------------------------------------- +# # We can utilize the methods above to build the read video function that follows +# # the same API to the existing ``read_video`` function. +# +# +# def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): +# if end is None: +# end = float("inf") +# if end < start: +# raise ValueError( +# "end time should be larger than start time, got " +# f"start time={start} and end time={end}" +# ) +# +# video_frames = torch.empty(0) +# video_pts = [] +# if read_video: +# video_object.set_current_stream("video") +# frames = [] +# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): +# frames.append(frame['data']) +# video_pts.append(frame['pts']) +# if len(frames) > 0: +# video_frames = torch.stack(frames, 0) +# +# audio_frames = torch.empty(0) +# audio_pts = [] +# if read_audio: +# video_object.set_current_stream("audio") +# frames = [] +# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): +# frames.append(frame['data']) +# audio_pts.append(frame['pts']) +# if len(frames) > 0: +# audio_frames = torch.cat(frames, 0) +# +# return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() +# +# +# # Total number of frames should be 327 for video and 523264 datapoints for audio +# vf, af, info, meta = example_read_video(video) +# print(vf.size(), af.size()) +# +# #################################### +# # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) +# # ------------------------------------------------------------------------------------------------------- +# # Cool, so now we can use the same principle to make the sample dataset. +# # We suggest trying out iterable dataset for this purpose. +# # Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
+# +# #################################### +# # Make sample dataset +# import os +# os.makedirs("./dataset", exist_ok=True) +# os.makedirs("./dataset/1", exist_ok=True) +# os.makedirs("./dataset/2", exist_ok=True) +# +# #################################### +# # Download the videos +# from torchvision.datasets.utils import download_url +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", +# "./dataset/1", "WUzgd7C1pWA.mp4" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", +# "./dataset/1", +# "RATRACE_wave_f_nm_np1_fr_goo_37.avi" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", +# "./dataset/2", +# "SOX5yA1l24A.mp4" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", +# "./dataset/2", +# "v_SoccerJuggling_g23_c01.avi" +# ) +# download_url( +# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", +# "./dataset/2", +# "v_SoccerJuggling_g24_c01.avi" +# ) +# +# #################################### +# # Housekeeping and utilities +# import os +# import random +# +# from torchvision.datasets.folder import make_dataset +# from torchvision import transforms as t +# +# +# def _find_classes(dir): +# classes = [d.name for d in os.scandir(dir) if d.is_dir()] +# classes.sort() +# class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} +# return classes, class_to_idx +# +# +# def get_samples(root, extensions=(".mp4", ".avi")): +# _, class_to_idx = _find_classes(root) +# return make_dataset(root, class_to_idx, extensions=extensions) +# +# #################################### +# # We are going to define the dataset and some basic arguments. +# # We assume the structure of the FolderDataset, and add the following parameters: +# # +# # - ``clip_len``: length of a clip in frames +# # - ``frame_transform``: transform for every frame individually +# # - ``video_transform``: transform on a video sequence +# # +# # .. note:: +# # We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` +# # class allows us to naturally oversample clips or images from each video if needed. 
+# +# +# class RandomDataset(torch.utils.data.IterableDataset): +# def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): +# super(RandomDataset).__init__() +# +# self.samples = get_samples(root) +# +# # Allow for temporal jittering +# if epoch_size is None: +# epoch_size = len(self.samples) +# self.epoch_size = epoch_size +# +# self.clip_len = clip_len +# self.frame_transform = frame_transform +# self.video_transform = video_transform +# +# def __iter__(self): +# for i in range(self.epoch_size): +# # Get random sample +# path, target = random.choice(self.samples) +# # Get video object +# vid = torchvision.io.VideoReader(path, "video") +# metadata = vid.get_metadata() +# video_frames = [] # video frame buffer +# +# # Seek and return frames +# max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) +# start = random.uniform(0., max_seek) +# for frame in itertools.islice(vid.seek(start), self.clip_len): +# video_frames.append(self.frame_transform(frame['data'])) +# current_pts = frame['pts'] +# # Stack it into a tensor +# video = torch.stack(video_frames, 0) +# if self.video_transform: +# video = self.video_transform(video) +# output = { +# 'path': path, +# 'video': video, +# 'target': target, +# 'start': start, +# 'end': current_pts} +# yield output +# +# #################################### +# # Given a path of videos in a folder structure, i.e: +# # +# # - dataset +# # - class 1 +# # - file 0 +# # - file 1 +# # - ... +# # - class 2 +# # - file 0 +# # - file 1 +# # - ... +# # - ... +# # +# # We can generate a dataloader and test the dataset. +# +# +# transforms = [t.Resize((112, 112))] +# frame_transform = t.Compose(transforms) +# +# dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) +# +# #################################### +# from torch.utils.data import DataLoader +# loader = DataLoader(dataset, batch_size=12) +# data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} +# for batch in loader: +# for i in range(len(batch['path'])): +# data['video'].append(batch['path'][i]) +# data['start'].append(batch['start'][i].item()) +# data['end'].append(batch['end'][i].item()) +# data['tensorsize'].append(batch['video'][i].size()) +# print(data) +# +# #################################### +# # 4. Data Visualization +# # ---------------------------------- +# # Example of visualized video +# +# import matplotlib.pyplot as plt +# +# plt.figure(figsize=(12, 12)) +# for i in range(16): +# plt.subplot(4, 4, i + 1) +# plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) +# plt.axis("off") +# +# #################################### +# # Cleanup the video and dataset: +# import os +# import shutil +# os.remove("./WUzgd7C1pWA.mp4") +# shutil.rmtree("./dataset") From 027a90058d00942b9b18c59b5d0f96ac1d321e37 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 2 Nov 2022 15:31:30 +0100 Subject: [PATCH 6/7] remove instead of comment --- gallery/plot_video_api.py | 246 -------------------------------------- 1 file changed, 246 deletions(-) diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py index c867a61702f..02d37eaafad 100644 --- a/gallery/plot_video_api.py +++ b/gallery/plot_video_api.py @@ -100,249 +100,3 @@ for _ in video.seek(2): pass - - -# import itertools -# video.set_current_stream("video") -# -# frames = [] # we are going to save the frames here. 
-# -# # We seek into a second second of the video and use islice to get 10 frames since -# for frame, pts in itertools.islice(video.seek(2), 10): -# frames.append(frame) -# -# print("Total number of frames: ", len(frames)) -# -# ###################################### -# # Or if we wanted to read from 2nd to 5th second, -# # We seek into a second second of the video, -# # then we utilize the itertools takewhile to get the -# # correct number of frames: -# -# video.set_current_stream("video") -# frames = [] # we are going to save the frames here. -# video = video.seek(2) -# -# for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): -# frames.append(frame['data']) -# -# print("Total number of frames: ", len(frames)) -# approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] -# print("We can expect approx: ", approx_nf) -# print("Tensor size: ", frames[0].size()) -# -# #################################### -# # 2. Building a sample read_video function -# # ---------------------------------------------------------------------------------------- -# # We can utilize the methods above to build the read video function that follows -# # the same API to the existing ``read_video`` function. -# -# -# def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): -# if end is None: -# end = float("inf") -# if end < start: -# raise ValueError( -# "end time should be larger than start time, got " -# f"start time={start} and end time={end}" -# ) -# -# video_frames = torch.empty(0) -# video_pts = [] -# if read_video: -# video_object.set_current_stream("video") -# frames = [] -# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): -# frames.append(frame['data']) -# video_pts.append(frame['pts']) -# if len(frames) > 0: -# video_frames = torch.stack(frames, 0) -# -# audio_frames = torch.empty(0) -# audio_pts = [] -# if read_audio: -# video_object.set_current_stream("audio") -# frames = [] -# for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): -# frames.append(frame['data']) -# audio_pts.append(frame['pts']) -# if len(frames) > 0: -# audio_frames = torch.cat(frames, 0) -# -# return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() -# -# -# # Total number of frames should be 327 for video and 523264 datapoints for audio -# vf, af, info, meta = example_read_video(video) -# print(vf.size(), af.size()) -# -# #################################### -# # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# # ------------------------------------------------------------------------------------------------------- -# # Cool, so now we can use the same principle to make the sample dataset. -# # We suggest trying out iterable dataset for this purpose. -# # Here, we are going to build an example dataset that reads randomly selected 10 frames of video. 
-# -# #################################### -# # Make sample dataset -# import os -# os.makedirs("./dataset", exist_ok=True) -# os.makedirs("./dataset/1", exist_ok=True) -# os.makedirs("./dataset/2", exist_ok=True) -# -# #################################### -# # Download the videos -# from torchvision.datasets.utils import download_url -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", -# "./dataset/1", "WUzgd7C1pWA.mp4" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", -# "./dataset/1", -# "RATRACE_wave_f_nm_np1_fr_goo_37.avi" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", -# "./dataset/2", -# "SOX5yA1l24A.mp4" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", -# "./dataset/2", -# "v_SoccerJuggling_g23_c01.avi" -# ) -# download_url( -# "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", -# "./dataset/2", -# "v_SoccerJuggling_g24_c01.avi" -# ) -# -# #################################### -# # Housekeeping and utilities -# import os -# import random -# -# from torchvision.datasets.folder import make_dataset -# from torchvision import transforms as t -# -# -# def _find_classes(dir): -# classes = [d.name for d in os.scandir(dir) if d.is_dir()] -# classes.sort() -# class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} -# return classes, class_to_idx -# -# -# def get_samples(root, extensions=(".mp4", ".avi")): -# _, class_to_idx = _find_classes(root) -# return make_dataset(root, class_to_idx, extensions=extensions) -# -# #################################### -# # We are going to define the dataset and some basic arguments. -# # We assume the structure of the FolderDataset, and add the following parameters: -# # -# # - ``clip_len``: length of a clip in frames -# # - ``frame_transform``: transform for every frame individually -# # - ``video_transform``: transform on a video sequence -# # -# # .. note:: -# # We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# # class allows us to naturally oversample clips or images from each video if needed. 
-# -# -# class RandomDataset(torch.utils.data.IterableDataset): -# def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): -# super(RandomDataset).__init__() -# -# self.samples = get_samples(root) -# -# # Allow for temporal jittering -# if epoch_size is None: -# epoch_size = len(self.samples) -# self.epoch_size = epoch_size -# -# self.clip_len = clip_len -# self.frame_transform = frame_transform -# self.video_transform = video_transform -# -# def __iter__(self): -# for i in range(self.epoch_size): -# # Get random sample -# path, target = random.choice(self.samples) -# # Get video object -# vid = torchvision.io.VideoReader(path, "video") -# metadata = vid.get_metadata() -# video_frames = [] # video frame buffer -# -# # Seek and return frames -# max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) -# start = random.uniform(0., max_seek) -# for frame in itertools.islice(vid.seek(start), self.clip_len): -# video_frames.append(self.frame_transform(frame['data'])) -# current_pts = frame['pts'] -# # Stack it into a tensor -# video = torch.stack(video_frames, 0) -# if self.video_transform: -# video = self.video_transform(video) -# output = { -# 'path': path, -# 'video': video, -# 'target': target, -# 'start': start, -# 'end': current_pts} -# yield output -# -# #################################### -# # Given a path of videos in a folder structure, i.e: -# # -# # - dataset -# # - class 1 -# # - file 0 -# # - file 1 -# # - ... -# # - class 2 -# # - file 0 -# # - file 1 -# # - ... -# # - ... -# # -# # We can generate a dataloader and test the dataset. -# -# -# transforms = [t.Resize((112, 112))] -# frame_transform = t.Compose(transforms) -# -# dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) -# -# #################################### -# from torch.utils.data import DataLoader -# loader = DataLoader(dataset, batch_size=12) -# data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} -# for batch in loader: -# for i in range(len(batch['path'])): -# data['video'].append(batch['path'][i]) -# data['start'].append(batch['start'][i].item()) -# data['end'].append(batch['end'][i].item()) -# data['tensorsize'].append(batch['video'][i].size()) -# print(data) -# -# #################################### -# # 4. Data Visualization -# # ---------------------------------- -# # Example of visualized video -# -# import matplotlib.pyplot as plt -# -# plt.figure(figsize=(12, 12)) -# for i in range(16): -# plt.subplot(4, 4, i + 1) -# plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) -# plt.axis("off") -# -# #################################### -# # Cleanup the video and dataset: -# import os -# import shutil -# os.remove("./WUzgd7C1pWA.mp4") -# shutil.rmtree("./dataset") From c85fef95c4f2b73ca2f9064f3b282046803c156b Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 3 Nov 2022 22:04:10 +0100 Subject: [PATCH 7/7] revert video api gallery --- gallery/plot_video_api.py | 247 +++++++++++++++++++++++++++++++++++++- 1 file changed, 243 insertions(+), 4 deletions(-) diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py index 02d37eaafad..d83a508eabe 100644 --- a/gallery/plot_video_api.py +++ b/gallery/plot_video_api.py @@ -94,9 +94,248 @@ # # For example, if we wanted to read ten frames from second second: -# FIXME: With https://github.com/pytorch/vision/pull/6598 this blocks leads to sphinx build hanging when using -# multiprocessing. 
+
+import itertools
+video.set_current_stream("video")
+
+frames = []  # we are going to save the frames here.
+
+# We seek to the second second of the video and use ``islice`` to get the next 10 frames:
+for frame in itertools.islice(video.seek(2), 10):
+    frames.append(frame['data'])
+
+print("Total number of frames: ", len(frames))
+
+######################################
+# Or, if we wanted to read from the 2nd to the 5th second,
+# we seek to the second second of the video,
+# then use ``itertools.takewhile`` to collect frames until
+# their presentation timestamp exceeds 5 seconds:
+
 video.set_current_stream("video")
+frames = []  # we are going to save the frames here.
+video = video.seek(2)
+
+for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video):
+    frames.append(frame['data'])
+
+print("Total number of frames: ", len(frames))
+approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
+print("We can expect approx: ", approx_nf)
+print("Tensor size: ", frames[0].size())
+
+####################################
+# 2. Building a sample read_video function
+# ----------------------------------------------------------------------------------------
+# We can use the methods above to build a read_video function that follows
+# the same API as the existing ``read_video`` function.
+
+
+def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True):
+    if end is None:
+        end = float("inf")
+    if end < start:
+        raise ValueError(
+            "end time should be larger than start time, got "
+            f"start time={start} and end time={end}"
+        )
+
+    video_frames = torch.empty(0)
+    video_pts = []
+    if read_video:
+        video_object.set_current_stream("video")
+        frames = []
+        for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
+            frames.append(frame['data'])
+            video_pts.append(frame['pts'])
+        if len(frames) > 0:
+            video_frames = torch.stack(frames, 0)
+
+    audio_frames = torch.empty(0)
+    audio_pts = []
+    if read_audio:
+        video_object.set_current_stream("audio")
+        frames = []
+        for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
+            frames.append(frame['data'])
+            audio_pts.append(frame['pts'])
+        if len(frames) > 0:
+            audio_frames = torch.cat(frames, 0)
+
+    return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata()
+
+
+# Total number of frames should be 327 for video and 523264 datapoints for audio
+vf, af, info, meta = example_read_video(video)
+print(vf.size(), af.size())
+
+####################################
+# 3. Building an example randomly sampled dataset (applicable to e.g. the Kinetics-400 training set)
+# -------------------------------------------------------------------------------------------------------
+# We can now use the same principle to build a sample dataset.
+# We suggest using an iterable dataset for this purpose.
+# Below, we build an example dataset that reads a randomly selected clip from each video.
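+####################################
+# Before assembling the full dataset, here is a minimal sketch of the sampling
+# pattern it will rely on (an illustration only, reusing the ``video`` reader
+# from above; the clip length of 16 frames is an arbitrary choice for this
+# sketch): seek to a random start point and read a fixed number of frames from
+# there.
+
+import random
+
+sketch_clip_len = 16
+video.set_current_stream("video")
+metadata = video.get_metadata()
+# Latest start time that still leaves room for a full clip
+max_seek = metadata["video"]['duration'][0] - (sketch_clip_len / metadata["video"]['fps'][0])
+start = random.uniform(0., max_seek)
+clip = [frame['data'] for frame in itertools.islice(video.seek(start), sketch_clip_len)]
+print("Sampled a clip of", len(clip), "frames starting at ~{:.2f} seconds".format(start))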
+
+####################################
+# Make sample dataset
+import os
+os.makedirs("./dataset", exist_ok=True)
+os.makedirs("./dataset/1", exist_ok=True)
+os.makedirs("./dataset/2", exist_ok=True)
+
+####################################
+# Download the videos
+from torchvision.datasets.utils import download_url
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
+    "./dataset/1", "WUzgd7C1pWA.mp4"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true",
+    "./dataset/1",
+    "RATRACE_wave_f_nm_np1_fr_goo_37.avi"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true",
+    "./dataset/2",
+    "SOX5yA1l24A.mp4"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true",
+    "./dataset/2",
+    "v_SoccerJuggling_g23_c01.avi"
+)
+download_url(
+    "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true",
+    "./dataset/2",
+    "v_SoccerJuggling_g24_c01.avi"
+)
+
+####################################
+# Housekeeping and utilities
+import os
+import random
+
+from torchvision.datasets.folder import make_dataset
+from torchvision import transforms as t
+
+
+def _find_classes(dir):
+    classes = [d.name for d in os.scandir(dir) if d.is_dir()]
+    classes.sort()
+    class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
+    return classes, class_to_idx
+
+
+def get_samples(root, extensions=(".mp4", ".avi")):
+    _, class_to_idx = _find_classes(root)
+    return make_dataset(root, class_to_idx, extensions=extensions)
+
+####################################
+# We are going to define the dataset and some basic arguments.
+# We assume a ``DatasetFolder``-like directory structure and add the following parameters:
+#
+# - ``clip_len``: length of a clip in frames
+# - ``frame_transform``: transform applied to every frame individually
+# - ``video_transform``: transform applied to the whole video sequence
+#
+# .. note::
+#   We also add an ``epoch_size`` parameter: because the dataset is a
+#   :class:`~torch.utils.data.IterableDataset`, this lets us naturally oversample
+#   clips or images from each video if needed.
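+####################################
+# As a quick sanity check (an illustrative aside, assuming the five downloads
+# above succeeded), ``get_samples`` returns a list of ``(path, class_index)``
+# tuples -- exactly what the dataset defined next will draw from:
+
+samples = get_samples("./dataset")
+print("Number of samples: ", len(samples))
+print("First sample: ", samples[0])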
+
+
+class RandomDataset(torch.utils.data.IterableDataset):
+    def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16):
+        super().__init__()
+
+        self.samples = get_samples(root)
-for _ in video.seek(2):
-    pass
+        # Allow for temporal jittering
+        if epoch_size is None:
+            epoch_size = len(self.samples)
+        self.epoch_size = epoch_size
+
+        self.clip_len = clip_len
+        self.frame_transform = frame_transform
+        self.video_transform = video_transform
+
+    def __iter__(self):
+        for i in range(self.epoch_size):
+            # Get random sample
+            path, target = random.choice(self.samples)
+            # Get video object
+            vid = torchvision.io.VideoReader(path, "video")
+            metadata = vid.get_metadata()
+            video_frames = []  # video frame buffer
+
+            # Seek and return frames
+            max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0])
+            start = random.uniform(0., max_seek)
+            for frame in itertools.islice(vid.seek(start), self.clip_len):
+                video_frames.append(self.frame_transform(frame['data']))
+                current_pts = frame['pts']
+            # Stack it into a tensor
+            video = torch.stack(video_frames, 0)
+            if self.video_transform:
+                video = self.video_transform(video)
+            output = {
+                'path': path,
+                'video': video,
+                'target': target,
+                'start': start,
+                'end': current_pts}
+            yield output
+
+####################################
+# Given a folder structure of videos, i.e.:
+#
+# - dataset
+#    - class 1
+#        - file 0
+#        - file 1
+#        - ...
+#    - class 2
+#        - file 0
+#        - file 1
+#        - ...
+#    - ...
+#
+# we can generate a dataloader and test the dataset.
+
+
+transforms = [t.Resize((112, 112))]
+frame_transform = t.Compose(transforms)
+
+dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
+
+####################################
+from torch.utils.data import DataLoader
+loader = DataLoader(dataset, batch_size=12)
+data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
+for batch in loader:
+    for i in range(len(batch['path'])):
+        data['video'].append(batch['path'][i])
+        data['start'].append(batch['start'][i].item())
+        data['end'].append(batch['end'][i].item())
+        data['tensorsize'].append(batch['video'][i].size())
+print(data)
+
+####################################
+# 4. Data Visualization
+# ----------------------------------
+# Example of a visualized video
+
+import matplotlib.pyplot as plt
+
+plt.figure(figsize=(12, 12))
+for i in range(16):
+    plt.subplot(4, 4, i + 1)
+    plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
+    plt.axis("off")
+
+####################################
+# Cleanup the video and dataset:
+import os
+import shutil
+os.remove("./WUzgd7C1pWA.mp4")
+shutil.rmtree("./dataset")
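+####################################
+# A closing note on ``epoch_size`` (an illustrative sketch only, not executed
+# here since the dataset folder was just removed): because ``RandomDataset``
+# draws one random clip per iteration, oversampling is simply a matter of
+# asking for more iterations per epoch than there are videos, e.g.:
+#
+# .. code-block:: python
+#
+#     oversampled = RandomDataset("./dataset", epoch_size=4 * 5, frame_transform=frame_transform)
+#     oversampled_loader = DataLoader(oversampled, batch_size=12)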