From 9b07c43ed23aaf2f9a0f48ef8b820a155a043daa Mon Sep 17 00:00:00 2001 From: Chandresh Kanani Date: Mon, 9 Sep 2019 12:53:36 +0530 Subject: [PATCH 1/2] modified code of io.video.read_video to interpret start_pts and end_pts in seconds --- torchvision/io/video.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index bd25c224ecb..9c9c6700f25 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -2,6 +2,7 @@ import gc import torch import numpy as np +import math try: import av @@ -184,14 +185,22 @@ def read_video(filename, start_pts=0, end_pts=None): video_frames = [] if container.streams.video: + video_stream = container.streams.video[0] + start_pts = math.floor(start_pts*(1/video_stream.time_base)) + if end_pts != float("inf"): + end_pts = math.ceil(end_pts*(1/video_stream.time_base)) video_frames = _read_from_stream(container, start_pts, end_pts, - container.streams.video[0], {'video': 0}) - info["video_fps"] = float(container.streams.video[0].average_rate) + video_stream, {'video': 0}) + info["video_fps"] = float(video_stream.average_rate) audio_frames = [] if container.streams.audio: + audio_stream = container.streams.audio[0] + start_pts = math.floor(start_pts*(1/audio_stream.time_base)) + if end_pts != float("inf"): + end_pts = math.ceil(end_pts*(1/audio_stream.time_base)) audio_frames = _read_from_stream(container, start_pts, end_pts, - container.streams.audio[0], {'audio': 0}) - info["audio_fps"] = container.streams.audio[0].rate + audio_stream , {'audio': 0}) + info["audio_fps"] = audio_stream.rate container.close() From 85db823f02def0c2ac3a297d233f94019b209ed6 Mon Sep 17 00:00:00 2001 From: Chandresh Kanani Date: Thu, 12 Sep 2019 12:35:35 +0530 Subject: [PATCH 2/2] added pts_unit parameter in read_video and read_video_timestamps methods --- torchvision/io/video.py | 49 ++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index 9c9c6700f25..7b340cd9766 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -3,6 +3,7 @@ import torch import numpy as np import math +import warnings try: import av @@ -146,7 +147,7 @@ def _align_audio_frames(aframes, audio_frames, ref_start, ref_end): return aframes[:, s_idx:e_idx] -def read_video(filename, start_pts=0, end_pts=None): +def read_video(filename, start_pts=0, end_pts=None, pts_unit='sec'): """ Reads a video from a file, returning both the video frames as well as the audio frames @@ -159,6 +160,8 @@ def read_video(filename, start_pts=0, end_pts=None): the start presentation time of the video end_pts : int, optional the end presentation time + pts_unit : str, optional + unit in which start_pts and end_pts values will be interpreted, either 'pts' or 'sec. Defaults to 'sec'. Returns ------- @@ -186,19 +189,31 @@ def read_video(filename, start_pts=0, end_pts=None): video_frames = [] if container.streams.video: video_stream = container.streams.video[0] - start_pts = math.floor(start_pts*(1/video_stream.time_base)) - if end_pts != float("inf"): - end_pts = math.ceil(end_pts*(1/video_stream.time_base)) - video_frames = _read_from_stream(container, start_pts, end_pts, + if pts_unit == 'pts': + warnings.warn("The pts_unit 'pts' produces wrong results and will be removed + in a follow-up version. Please use pts_unit 'sec'.") + video_start_pts = start_pts + video_end_pts = end_pts + else: + video_start_pts = math.floor(start_pts*(1/video_stream.time_base)) + if video_end_pts != float("inf"): + video_end_pts = math.ceil(end_pts*(1/video_stream.time_base)) + video_frames = _read_from_stream(container, video_start_pts, video_end_pts, video_stream, {'video': 0}) info["video_fps"] = float(video_stream.average_rate) audio_frames = [] if container.streams.audio: audio_stream = container.streams.audio[0] - start_pts = math.floor(start_pts*(1/audio_stream.time_base)) - if end_pts != float("inf"): - end_pts = math.ceil(end_pts*(1/audio_stream.time_base)) - audio_frames = _read_from_stream(container, start_pts, end_pts, + if pts_unit == 'pts': + warnings.warn("The pts_unit 'pts' produces wrong results and will be removed + in a follow-up version. Please use pts_unit 'sec'.") + audio_start_pts = start_pts + audio_end_pts = end_pts + else: + audio_start_pts = math.floor(start_pts*(1/audio_stream.time_base)) + if end_pts != float("inf"): + audio_end_pts = math.ceil(end_pts*(1/audio_stream.time_base)) + audio_frames = _read_from_stream(container, audio_start_pts, audio_end_pts, audio_stream , {'audio': 0}) info["audio_fps"] = audio_stream.rate @@ -226,7 +241,7 @@ def _can_read_timestamps_from_packets(container): return False -def read_video_timestamps(filename): +def read_video_timestamps(filename, pts_unit='pts'): """ List the video frames timestamps. @@ -236,6 +251,8 @@ def read_video_timestamps(filename): ---------- filename : str path to the video file + pts_unit : str, optional + unit in which timestamp values will be returned either 'pts' or 'sec. Defaults to 'sec'. Returns ------- @@ -251,12 +268,18 @@ def read_video_timestamps(filename): video_frames = [] video_fps = None if container.streams.video: + video_stream = container.streams.video[0] if _can_read_timestamps_from_packets(container): # fast path video_frames = [x for x in container.demux(video=0) if x.pts is not None] else: video_frames = _read_from_stream(container, 0, float("inf"), - container.streams.video[0], {'video': 0}) - video_fps = float(container.streams.video[0].average_rate) + video_stream, {'video': 0}) + video_fps = float(video_stream.average_rate) container.close() - return [x.pts for x in video_frames], video_fps + if pts_unit == 'pts': + warnings.warn("The pts_unit 'pts' produces wrong results and will be removed in a + follow-up version. Please use pts_unit 'sec'.") + return [x.pts for x in video_frames], video_fps + else: + return [x.pts*video_stream.time_base for x in video_frames], video_fps