Commit 4383337

Add support of inplace on convert_format_bounding_box
1 parent 6af796a commit 4383337

File tree: 5 files changed, +32 -36 lines changed

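The commit threads a new inplace flag (default False) through convert_format_bounding_box and down into its private per-format helpers, so call sites that already hold a freshly allocated tensor can skip one clone per conversion. A minimal sketch of the resulting call pattern, assuming the prototype API as it stands at this commit (box values are illustrative):

import torch
from torchvision.prototype import features
from torchvision.prototype.transforms import functional as F

xywh = torch.tensor([[10.0, 20.0, 5.0, 5.0]])  # one box in XYWH format

# Default behavior is unchanged: the input tensor is left untouched.
xyxy = F.convert_format_bounding_box(
    xywh, old_format=features.BoundingBoxFormat.XYWH, new_format=features.BoundingBoxFormat.XYXY
)

# With inplace=True the kernel may write into (and return) the input tensor.
xyxy = F.convert_format_bounding_box(
    xywh, old_format=features.BoundingBoxFormat.XYWH, new_format=features.BoundingBoxFormat.XYXY, inplace=True
)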

torchvision/prototype/transforms/_augment.py

Lines changed: 1 addition & 1 deletion
@@ -262,7 +262,7 @@ def _copy_paste(
         # https://github.com/pytorch/vision/blob/b6feccbc4387766b76a3e22b13815dbbbfa87c0f/torchvision/models/detection/roi_heads.py#L418-L422
         xyxy_boxes[:, 2:] += 1
         boxes = F.convert_format_bounding_box(
-            xyxy_boxes, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox_format
+            xyxy_boxes, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox_format, inplace=True
         )
         out_target["boxes"] = torch.cat([boxes, paste_boxes])
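
Here xyxy_boxes is mutated just above (xyxy_boxes[:, 2:] += 1), so it is already a private working copy; converting it back to bbox_format in place saves one allocation without changing the visible behavior of _copy_paste.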

torchvision/prototype/transforms/_geometry.py

Lines changed: 2 additions & 1 deletion
@@ -618,7 +618,7 @@ def _check_inputs(self, flat_inputs: List[Any]) -> None:

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        orig_h, orig_w = query_spatial_size(flat_inputs)
-        bboxes = query_bounding_box(flat_inputs)
+        bboxes = query_bounding_box(flat_inputs).as_subclass(torch.Tensor)

        while True:
            # sample an option

@@ -798,6 +798,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:

        if needs_crop and bounding_boxes is not None:
            format = bounding_boxes.format
+            bounding_boxes = bounding_boxes.as_subclass(torch.Tensor)
            bounding_boxes, spatial_size = F.crop_bounding_box(
                bounding_boxes, format=format, top=top, left=left, height=new_height, width=new_width
            )
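
Both hunks unwrap the BoundingBox feature to a plain torch.Tensor before handing it to code that now operates in place, so the subsequent tensor ops skip the feature subclass dispatch and _get_params works on raw coordinates only. A sketch of what the unwrapping does, assuming the prototype BoundingBox constructor of this era:

import torch
from torchvision.prototype import features

bbox = features.BoundingBox(
    torch.tensor([[0.0, 0.0, 10.0, 10.0]]),
    format=features.BoundingBoxFormat.XYXY,
    spatial_size=(32, 32),
)
plain = bbox.as_subclass(torch.Tensor)  # same storage, plain Tensor type
assert plain.data_ptr() == bbox.data_ptr()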

torchvision/prototype/transforms/_misc.py

Lines changed: 1 addition & 1 deletion
@@ -200,7 +200,7 @@ def __init__(self, min_size: float = 1.0) -> None:
        self.min_size = min_size

    def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
-        bounding_box = query_bounding_box(flat_inputs)
+        bounding_box = query_bounding_box(flat_inputs).as_subclass(torch.Tensor)

        # TODO: We can improve performance here by not using the `remove_small_boxes` function. It requires the box to
        # be in XYXY format only to calculate the width and height internally. Thus, if the box is in XYWH or CXCYWH
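
Same unwrapping as in _geometry.py: this _get_params only inspects raw box coordinates, so it drops to a plain torch.Tensor immediately after query_bounding_box instead of carrying the feature wrapper through the follow-up computations.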

torchvision/prototype/transforms/functional/_geometry.py

Lines changed: 17 additions & 21 deletions
@@ -38,16 +38,14 @@ def horizontal_flip_bounding_box(

    # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
    # BoundingBoxFormat instead of converting back and forth
-    bounding_box = (
-        bounding_box.clone()
-        if format == features.BoundingBoxFormat.XYXY
-        else convert_format_bounding_box(bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY)
+    bounding_box = convert_format_bounding_box(
+        bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
    ).reshape(-1, 4)

    bounding_box[:, [0, 2]] = spatial_size[1] - bounding_box[:, [2, 0]]

    return convert_format_bounding_box(
-        bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format
+        bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
    ).reshape(shape)

@@ -79,16 +77,14 @@ def vertical_flip_bounding_box(

    # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
    # BoundingBoxFormat instead of converting back and forth
-    bounding_box = (
-        bounding_box.clone()
-        if format == features.BoundingBoxFormat.XYXY
-        else convert_format_bounding_box(bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY)
+    bounding_box = convert_format_bounding_box(
+        bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
    ).reshape(-1, 4)

    bounding_box[:, [1, 3]] = spatial_size[0] - bounding_box[:, [3, 1]]

    return convert_format_bounding_box(
-        bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format
+        bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
    ).reshape(shape)

@@ -412,7 +408,7 @@ def affine_bounding_box(
    # out_bboxes should be of shape [N boxes, 4]

    return convert_format_bounding_box(
-        out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format
+        out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
    ).reshape(original_shape)

@@ -594,9 +590,9 @@ def rotate_bounding_box(
    )

    return (
-        convert_format_bounding_box(out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format).reshape(
-            original_shape
-        ),
+        convert_format_bounding_box(
+            out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
+        ).reshape(original_shape),
        spatial_size,
    )

@@ -815,18 +811,18 @@ def crop_bounding_box(
 ) -> Tuple[torch.Tensor, Tuple[int, int]]:
    # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
    # BoundingBoxFormat instead of converting back and forth
-    bounding_box = (
-        bounding_box.clone()
-        if format == features.BoundingBoxFormat.XYXY
-        else convert_format_bounding_box(bounding_box, old_format=format, new_format=features.BoundingBoxFormat.XYXY)
+    bounding_box = convert_format_bounding_box(
+        bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
    )

    # Crop or implicit pad if left and/or top have negative values:
    bounding_box[..., 0::2] -= left
    bounding_box[..., 1::2] -= top

    return (
-        convert_format_bounding_box(bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format),
+        convert_format_bounding_box(
+            bounding_box, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
+        ),
        (height, width),
    )

@@ -964,7 +960,7 @@ def perspective_bounding_box(
    # out_bboxes should be of shape [N boxes, 4]

    return convert_format_bounding_box(
-        out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format
+        out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
    ).reshape(original_shape)

@@ -1085,7 +1081,7 @@ def elastic_bounding_box(
    out_bboxes = torch.cat([out_bbox_mins, out_bbox_maxs], dim=1).to(bounding_box.dtype)

    return convert_format_bounding_box(
-        out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format
+        out_bboxes, old_format=features.BoundingBoxFormat.XYXY, new_format=format, inplace=True
    ).reshape(original_shape)
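
Every kernel above previously special-cased XYXY input: clone when the format was already XYXY, otherwise call the converting kernel, which cloned internally. With the new flag the pattern collapses to one shape: clone exactly once, then convert in place. That still costs a single allocation in every branch, because convert_format_bounding_box returns its input untouched when the old and new formats match. A condensed sketch of the pattern, using a hypothetical helper name that is not part of the commit:

def _to_xyxy_working_copy(bounding_box, format):
    # One explicit clone; the conversion then reuses that buffer and is a
    # no-op when the boxes are already XYXY.
    return convert_format_bounding_box(
        bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
    )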

torchvision/prototype/transforms/functional/_meta.py

Lines changed: 11 additions & 12 deletions
@@ -119,8 +119,8 @@ def get_num_frames(inpt: features.VideoTypeJIT) -> int:
    raise TypeError(f"The video should be a Tensor. Got {type(inpt)}")


-def _xywh_to_xyxy(xywh: torch.Tensor) -> torch.Tensor:
-    xyxy = xywh.clone()
+def _xywh_to_xyxy(xywh: torch.Tensor, inplace: bool) -> torch.Tensor:
+    xyxy = xywh if inplace else xywh.clone()
    xyxy[..., 2:] += xyxy[..., :2]
    return xyxy

@@ -150,20 +151,20 @@ def _xyxy_to_cxcywh(xyxy: torch.Tensor) -> torch.Tensor:


 def convert_format_bounding_box(
-    bounding_box: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat
+    bounding_box: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat, inplace: bool = False
 ) -> torch.Tensor:
    if new_format == old_format:
        return bounding_box

    if old_format == BoundingBoxFormat.XYWH:
-        bounding_box = _xywh_to_xyxy(bounding_box)
+        bounding_box = _xywh_to_xyxy(bounding_box, inplace)
    elif old_format == BoundingBoxFormat.CXCYWH:
-        bounding_box = _cxcywh_to_xyxy(bounding_box)
+        bounding_box = _cxcywh_to_xyxy(bounding_box, inplace)

    if new_format == BoundingBoxFormat.XYWH:
-        bounding_box = _xyxy_to_xywh(bounding_box)
+        bounding_box = _xyxy_to_xywh(bounding_box, inplace)
    elif new_format == BoundingBoxFormat.CXCYWH:
-        bounding_box = _xyxy_to_cxcywh(bounding_box)
+        bounding_box = _xyxy_to_cxcywh(bounding_box, inplace)

    return bounding_box

@@ -173,14 +174,12 @@ def clamp_bounding_box(
 ) -> torch.Tensor:
    # TODO: Investigate if it makes sense from a performance perspective to have an implementation for every
    # BoundingBoxFormat instead of converting back and forth
-    xyxy_boxes = (
-        bounding_box.clone()
-        if format == BoundingBoxFormat.XYXY
-        else convert_format_bounding_box(bounding_box, format, BoundingBoxFormat.XYXY)
+    xyxy_boxes = convert_format_bounding_box(
+        bounding_box.clone(), old_format=format, new_format=features.BoundingBoxFormat.XYXY, inplace=True
    )
    xyxy_boxes[..., 0::2].clamp_(min=0, max=spatial_size[1])
    xyxy_boxes[..., 1::2].clamp_(min=0, max=spatial_size[0])
-    return convert_format_bounding_box(xyxy_boxes, BoundingBoxFormat.XYXY, format)
+    return convert_format_bounding_box(xyxy_boxes, old_format=BoundingBoxFormat.XYXY, new_format=format, inplace=True)


 def _strip_alpha(image: torch.Tensor) -> torch.Tensor:
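
Per the new signature, inplace defaults to False, so existing callers keep copy semantics. A quick illustration of the in-place path (box values made up; imports follow the prototype layout used in this diff):

import torch
from torchvision.prototype.features import BoundingBoxFormat
from torchvision.prototype.transforms.functional import convert_format_bounding_box

xywh = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
out = convert_format_bounding_box(
    xywh, old_format=BoundingBoxFormat.XYWH, new_format=BoundingBoxFormat.XYXY, inplace=True
)
assert out is xywh  # converted in place, no copy made
assert out.tolist() == [[1.0, 2.0, 4.0, 6.0]]  # x2 = x1 + w, y2 = y1 + h

Note that when old_format == new_format the function returns its input unchanged regardless of inplace, so callers that need a private copy must clone first, exactly as the kernels above do.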
