diff --git a/test/test_videoapi.py b/test/test_videoapi.py
index 895b9b83555..4688e5a640b 100644
--- a/test/test_videoapi.py
+++ b/test/test_videoapi.py
@@ -77,6 +77,7 @@ def test_frame_reading(self, test_video):
                 # compare the frames and ptss
                 for i in range(len(vr_frames)):
                     assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
+
                     mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
                     # on average the difference is very small and caused
                     # by decoding (around 1%)
@@ -114,6 +115,46 @@ def test_frame_reading(self, test_video):
                     # we assure that there is never more than 1% difference in signal
                     assert max_delta.item() < 0.001
 
+    @pytest.mark.parametrize("stream", ["video", "audio"])
+    @pytest.mark.parametrize("test_video", test_videos.keys())
+    def test_frame_reading_mem_vs_file(self, test_video, stream):
+        """Check that VideoReader yields the same pts and near-identical frames from raw bytes as from a path."""
+        full_path = os.path.join(VIDEO_DIR, test_video)
+
+        # Test video reading from file vs from memory
+        vr_frames, vr_frames_mem = [], []
+        vr_pts, vr_pts_mem = [], []
+        # get vr frames
+        video_reader = VideoReader(full_path, stream)
+        for vr_frame in video_reader:
+            vr_frames.append(vr_frame["data"])
+            vr_pts.append(vr_frame["pts"])
+
+        # get vr frames, this time decoding from an in-memory copy of the file
+        with open(full_path, "rb") as f:
+            fbytes = f.read()
+        video_reader_from_mem = VideoReader(fbytes, stream)
+
+        for vr_frame_from_mem in video_reader_from_mem:
+            vr_frames_mem.append(vr_frame_from_mem["data"])
+            vr_pts_mem.append(vr_frame_from_mem["pts"])
+
+        # same number of frames
+        assert len(vr_frames) == len(vr_frames_mem)
+        assert len(vr_pts) == len(vr_pts_mem)
+
+        # compare the frames and ptss
+        for i in range(len(vr_frames)):
+            assert vr_pts[i] == vr_pts_mem[i]
+            mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float()))
+            # on average the difference is very small and caused
+            # by decoding (around 1%)
+            # TODO: assess empirically how to set this? atm it's 1%
+            # averaged over all frames
+            assert mean_delta.item() < 2.55
+
+        del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem
+
     @pytest.mark.parametrize("test_video,config", test_videos.items())
     def test_metadata(self, test_video, config):
         """
diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h
index dac6293d366..502e5762e46 100644
--- a/torchvision/csrc/io/decoder/defs.h
+++ b/torchvision/csrc/io/decoder/defs.h
@@ -165,7 +165,7 @@ struct MediaFormat {
 struct DecoderParameters {
   // local file, remote file, http url, rtmp stream uri, etc. anything that
   // ffmpeg can recognize
-  std::string uri;
+  std::string uri{std::string()};
   // timeout on getting bytes for decoding
   size_t timeoutMs{1000};
   // logging level, default AV_LOG_PANIC
diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp
index 38b35014595..d8b36a35adc 100644
--- a/torchvision/csrc/io/video/video.cpp
+++ b/torchvision/csrc/io/video/video.cpp
@@ -156,14 +156,34 @@ void Video::_getDecoderParams(
 
 } // _get decoder params
 
-Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
-  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
+void Video::initFromFile(
+    std::string videoPath,
+    std::string stream,
+    int64_t numThreads) {
+  TORCH_CHECK(!initialized, "Video object can only be initialized once");
+  initialized = true;
+  params.uri = videoPath;
+  _init(stream, numThreads);
+}
+
+void Video::initFromMemory(
+    torch::Tensor videoTensor,
+    std::string stream,
+    int64_t numThreads) {
+  TORCH_CHECK(!initialized, "Video object can only be initialized once");
+  initialized = true;
+  callback = MemoryBuffer::getCallback(
+      videoTensor.data_ptr<uint8_t>(), videoTensor.size(0));
+  _init(stream, numThreads);
+}
+
+void Video::_init(std::string stream, int64_t numThreads) {
   // set number of threads global
   numThreads_ = numThreads;
   // parse stream information
   current_stream = _parseStream(stream);
   // note that in the initial call we want to get all streams
-  Video::_getDecoderParams(
+  _getDecoderParams(
       0, // video start
       0, // headerOnly
       std::get<0>(current_stream), // stream info - remove that
@@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
 
   std::string logMessage, logType;
 
-  // TODO: add read from memory option
-  params.uri = videoPath;
-  logType = "file";
-  logMessage = videoPath;
-
   // locals
   std::vector<double> audioFPS, videoFPS;
   std::vector<double> audioDuration, videoDuration, ccDuration, subsDuration;
@@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
   c10::Dict<std::string, std::vector<double>> subsMetadata;
 
   // callback and metadata defined in struct
-  succeeded = decoder.init(params, std::move(callback), &metadata);
+  DecoderInCallback tmp_callback = callback;
+  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
   if (succeeded) {
     for (const auto& header : metadata) {
       double fps = double(header.fps);
@@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
   streamsMetadata.insert("subtitles", subsMetadata);
   streamsMetadata.insert("cc", ccMetadata);
 
-  succeeded = Video::setCurrentStream(stream);
+  succeeded = setCurrentStream(stream);
   LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n";
   if (std::get<1>(current_stream) != -1) {
     LOG(INFO)
         << "Stream index set to " << std::get<1>(current_stream)
         << ". If you encounter trouble, consider switching it to automatic stream discovery. \n";
   }
+}
+
+Video::Video(std::string videoPath, std::string stream, int64_t numThreads) {
+  C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video");
+  if (!videoPath.empty()) {
+    initFromFile(videoPath, stream, numThreads);
+  }
 } // video
 
 bool Video::setCurrentStream(std::string stream = "video") {
+  TORCH_CHECK(initialized, "Video object has to be initialized first");
   if ((!stream.empty()) && (_parseStream(stream) != current_stream)) {
     current_stream = _parseStream(stream);
   }
@@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") {
   );
 
   // callback and metadata defined in Video.h
-  return (decoder.init(params, std::move(callback), &metadata));
+  DecoderInCallback tmp_callback = callback;
+  return (decoder.init(params, std::move(tmp_callback), &metadata));
 }
 
 std::tuple<std::string, int64_t> Video::getCurrentStream() const {
+  TORCH_CHECK(initialized, "Video object has to be initialized first");
   return current_stream;
 }
 
 c10::Dict<std::string, c10::Dict<std::string, std::vector<double>>> Video::
     getStreamMetadata() const {
+  TORCH_CHECK(initialized, "Video object has to be initialized first");
   return streamsMetadata;
 }
 
 void Video::Seek(double ts, bool fastSeek = false) {
+  TORCH_CHECK(initialized, "Video object has to be initialized first");
   // initialize the class variables used for seeking and retrurn
   _getDecoderParams(
       ts, // video start
@@ -282,11 +310,14 @@ void Video::Seek(double ts, bool fastSeek = false) {
   );
 
   // callback and metadata defined in Video.h
-  succeeded = decoder.init(params, std::move(callback), &metadata);
+  DecoderInCallback tmp_callback = callback;
+  succeeded = decoder.init(params, std::move(tmp_callback), &metadata);
+
   LOG(INFO) << "Decoder init at seek " << succeeded << "\n";
 }
 
 std::tuple<torch::Tensor, double> Video::Next() {
+  TORCH_CHECK(initialized, "Video object has to be initialized first");
   // if failing to decode simply return a null tensor (note, should we
   // raise an exeption?)
   double frame_pts_s;
@@ -345,6 +376,8 @@ std::tuple<torch::Tensor, double> Video::Next() {
 static auto registerVideo =
     torch::class_