
Commit 0e7cc3a

Merge branch 'main' into revamp-prototype-features-transforms
2 parents be67431 + 11d903e

4 files changed (+88, -40 lines)


.github/process_commit.py

Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@
     "module: video",
     "Perf",
     "Revert(ed)",
+    "topic: build",
 }
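For context, process_commit.py keeps allow-lists of PR labels and classifies commits by checking membership against them; this hunk just teaches the script about the "topic: build" label. A minimal sketch of that membership pattern follows; the set name and the filter_labels helper are illustrative assumptions, not code from the script:

    # Sketch only: the set mirrors the diff context above; filter_labels is
    # a hypothetical helper, not part of process_commit.py.
    KNOWN_LABELS = {
        "module: video",
        "Perf",
        "Revert(ed)",
        "topic: build",  # newly recognized by this commit
    }

    def filter_labels(pr_labels):
        # Keep only labels the tooling knows how to bucket.
        return [label for label in pr_labels if label in KNOWN_LABELS]

    print(filter_labels(["topic: build", "cla signed"]))  # -> ['topic: build']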

test/test_models_detection_negative_samples.py

Lines changed: 11 additions & 0 deletions

@@ -143,6 +143,17 @@ def test_forward_negative_sample_retinanet(self):
 
         assert_equal(loss_dict["bbox_regression"], torch.tensor(0.0))
 
+    def test_forward_negative_sample_fcos(self):
+        model = torchvision.models.detection.fcos_resnet50_fpn(
+            num_classes=2, min_size=100, max_size=100, pretrained_backbone=False
+        )
+
+        images, targets = self._make_empty_sample()
+        loss_dict = model(images, targets)
+
+        assert_equal(loss_dict["bbox_regression"], torch.tensor(0.0))
+        assert_equal(loss_dict["bbox_ctrness"], torch.tensor(0.0))
+
     def test_forward_negative_sample_ssd(self):
         model = torchvision.models.detection.ssd300_vgg16(num_classes=2, pretrained_backbone=False)
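The new FCOS case mirrors the RetinaNet and SSD negative-sample tests around it: feed the model an image whose target contains zero boxes and assert that the box-regression and centerness losses are exactly zero. A minimal sketch of what such an empty sample plausibly looks like, assuming the standard torchvision detection target schema (the actual tensors come from the test class's _make_empty_sample helper, defined elsewhere in this file):

    import torch
    import torchvision

    # One image, and a target whose tensors keep the usual detection schema
    # but with a zero-length first dimension (no ground-truth objects).
    images = [torch.rand(3, 100, 100)]
    targets = [
        {
            "boxes": torch.zeros((0, 4), dtype=torch.float32),
            "labels": torch.zeros((0,), dtype=torch.int64),
        }
    ]

    model = torchvision.models.detection.fcos_resnet50_fpn(
        num_classes=2, min_size=100, max_size=100, pretrained_backbone=False
    )
    model.train()
    # In training mode the detection models return a dict of losses.
    loss_dict = model(images, targets)
    print(loss_dict["bbox_regression"], loss_dict["bbox_ctrness"])  # both 0.0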

test/test_videoapi.py

Lines changed: 62 additions & 31 deletions

@@ -56,37 +56,68 @@ def test_frame_reading(self):
         for test_video, config in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
 
-            av_reader = av.open(full_path)
-
-            if av_reader.streams.video:
-                video_reader = VideoReader(full_path, "video")
-                for av_frame in av_reader.decode(av_reader.streams.video[0]):
-                    vr_frame = next(video_reader)
-
-                    assert float(av_frame.pts * av_frame.time_base) == approx(vr_frame["pts"], abs=0.1)
-
-                    av_array = torch.tensor(av_frame.to_rgb().to_ndarray()).permute(2, 0, 1)
-                    vr_array = vr_frame["data"]
-                    mean_delta = torch.mean(torch.abs(av_array.float() - vr_array.float()))
-                    # on average the difference is very small and caused
-                    # by decoding (around 1%)
-                    # TODO: assess empirically how to set this? currently it's 1%
-                    # averaged over all frames
-                    assert mean_delta.item() < 2.5
-
-            av_reader = av.open(full_path)
-            if av_reader.streams.audio:
-                video_reader = VideoReader(full_path, "audio")
-                for av_frame in av_reader.decode(av_reader.streams.audio[0]):
-                    vr_frame = next(video_reader)
-                    assert float(av_frame.pts * av_frame.time_base) == approx(vr_frame["pts"], abs=0.1)
-
-                    av_array = torch.tensor(av_frame.to_ndarray()).permute(1, 0)
-                    vr_array = vr_frame["data"]
-
-                    max_delta = torch.max(torch.abs(av_array.float() - vr_array.float()))
-                    # we ensure that there is never more than 1% difference in signal
-                    assert max_delta.item() < 0.001
+            with av.open(full_path) as av_reader:
+                is_video = True if av_reader.streams.video else False
+
+                if is_video:
+                    av_frames, vr_frames = [], []
+                    av_pts, vr_pts = [], []
+                    # get av frames
+                    for av_frame in av_reader.decode(av_reader.streams.video[0]):
+                        av_frames.append(torch.tensor(av_frame.to_rgb().to_ndarray()).permute(2, 0, 1))
+                        av_pts.append(av_frame.pts * av_frame.time_base)
+
+                    # get vr frames
+                    video_reader = VideoReader(full_path, "video")
+                    for vr_frame in video_reader:
+                        vr_frames.append(vr_frame["data"])
+                        vr_pts.append(vr_frame["pts"])
+
+                    # same number of frames
+                    assert len(vr_frames) == len(av_frames)
+                    assert len(vr_pts) == len(av_pts)
+
+                    # compare the frames and pts values
+                    for i in range(len(vr_frames)):
+                        assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
+                        mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float()))
+                        # on average the difference is very small and caused
+                        # by decoding (around 1%)
+                        # TODO: assess empirically how to set this? currently it's 1%
+                        # averaged over all frames
+                        assert mean_delta.item() < 2.55
+
+                    del vr_frames, av_frames, vr_pts, av_pts
+
+            # test audio reading compared to PyAV
+            with av.open(full_path) as av_reader:
+                is_audio = True if av_reader.streams.audio else False
+
+                if is_audio:
+                    av_frames, vr_frames = [], []
+                    av_pts, vr_pts = [], []
+                    # get av frames
+                    for av_frame in av_reader.decode(av_reader.streams.audio[0]):
+                        av_frames.append(torch.tensor(av_frame.to_ndarray()).permute(1, 0))
+                        av_pts.append(av_frame.pts * av_frame.time_base)
+                    av_reader.close()
+
+                    # get vr frames
+                    video_reader = VideoReader(full_path, "audio")
+                    for vr_frame in video_reader:
+                        vr_frames.append(vr_frame["data"])
+                        vr_pts.append(vr_frame["pts"])
+
+                    # same number of frames
+                    assert len(vr_frames) == len(av_frames)
+                    assert len(vr_pts) == len(av_pts)
+
+                    # compare the frames and pts values
+                    for i in range(len(vr_frames)):
+                        assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1)
+                        max_delta = torch.max(torch.abs(av_frames[i].float() - vr_frames[i].float()))
+                        # we ensure that there is never more than 1% difference in signal
+                        assert max_delta.item() < 0.001
 
     def test_metadata(self):
         """

torchvision/models/detection/fcos.py

Lines changed: 14 additions & 9 deletions

@@ -59,9 +59,13 @@ def compute_loss(
         all_gt_classes_targets = []
         all_gt_boxes_targets = []
         for targets_per_image, matched_idxs_per_image in zip(targets, matched_idxs):
-            gt_classes_targets = targets_per_image["labels"][matched_idxs_per_image.clip(min=0)]
+            if len(targets_per_image["labels"]) == 0:
+                gt_classes_targets = targets_per_image["labels"].new_zeros((len(matched_idxs_per_image),))
+                gt_boxes_targets = targets_per_image["boxes"].new_zeros((len(matched_idxs_per_image), 4))
+            else:
+                gt_classes_targets = targets_per_image["labels"][matched_idxs_per_image.clip(min=0)]
+                gt_boxes_targets = targets_per_image["boxes"][matched_idxs_per_image.clip(min=0)]
             gt_classes_targets[matched_idxs_per_image < 0] = -1  # background
-            gt_boxes_targets = targets_per_image["boxes"][matched_idxs_per_image.clip(min=0)]
             all_gt_classes_targets.append(gt_classes_targets)
             all_gt_boxes_targets.append(gt_boxes_targets)

@@ -95,13 +99,14 @@ def compute_loss(
         ]
         bbox_reg_targets = torch.stack(bbox_reg_targets, dim=0)
         if len(bbox_reg_targets) == 0:
-            bbox_reg_targets.new_zeros(len(bbox_reg_targets))
-        left_right = bbox_reg_targets[:, :, [0, 2]]
-        top_bottom = bbox_reg_targets[:, :, [1, 3]]
-        gt_ctrness_targets = torch.sqrt(
-            (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0])
-            * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
-        )
+            gt_ctrness_targets = bbox_reg_targets.new_zeros(bbox_reg_targets.size()[:-1])
+        else:
+            left_right = bbox_reg_targets[:, :, [0, 2]]
+            top_bottom = bbox_reg_targets[:, :, [1, 3]]
+            gt_ctrness_targets = torch.sqrt(
+                (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0])
+                * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
+            )
         pred_centerness = bbox_ctrness.squeeze(dim=2)
         loss_bbox_ctrness = nn.functional.binary_cross_entropy_with_logits(
             pred_centerness[foregroud_mask], gt_ctrness_targets[foregroud_mask], reduction="sum"
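Both hunks guard degenerate inputs. The first substitutes all-zero class and box targets when an image has no ground-truth labels, instead of indexing into empty tensors; the second returns all-zero centerness targets when there are no regression targets (the old bbox_reg_targets.new_zeros(...) call discarded its result, so that guard previously had no effect). The else branch is the standard FCOS centerness target, sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b))), computed from the (left, top, right, bottom) regression distances. A small worked example with made-up values:

    import torch

    # Two hypothetical locations with (l, t, r, b) distances to their
    # matched box: one at the exact center, one off-center.
    bbox_reg_targets = torch.tensor(
        [[[10.0, 10.0, 10.0, 10.0],  # centered   -> centerness 1.0
          [2.0, 5.0, 18.0, 15.0]]]   # off-center -> centerness ~0.19
    )

    left_right = bbox_reg_targets[:, :, [0, 2]]
    top_bottom = bbox_reg_targets[:, :, [1, 3]]
    gt_ctrness_targets = torch.sqrt(
        (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0])
        * (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    )
    print(gt_ctrness_targets)  # tensor([[1.0000, 0.1925]])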
