Register ops to AutocastCPU

CaoE · CaoE · commit a810d89a9889 · 2021-09-15T14:38:13.000+08:00
* Modify the directory structure: move the existing autocast files from torchvision/csrc/ops/autocast/ to torchvision/csrc/ops/autocast/cuda/, and update their relative #include paths and the setup.py source glob to match.

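* Add a cpu directory under the autocast directory (torchvision/csrc/ops/autocast/cpu/) and include its *.cpp files in the CPU sources collected by setup.py.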

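* Register deform_conv2d, nms, ps_roi_align, ps_roi_pool, roi_align, and roi_pool under the AutocastCPU dispatch key. Each wrapper casts its tensor inputs to float32 before calling the underlying op; corresponding test_autocast_cpu tests are added to test/test_ops.py.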
diff --git a/setup.py b/setup.py
@@ -142,6 +142,7 @@ def get_extensions():
                                                                                           '*.cpp'))
     source_cpu = (
         glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) +
+        glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', 'cpu', '*.cpp')) +
         glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp')) +
         glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
     )
@@ -170,7 +171,7 @@ def get_extensions():
     else:
         source_cuda = glob.glob(os.path.join(extensions_dir, 'ops', 'cuda', '*.cu'))
 
-    source_cuda += glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', '*.cpp'))
+    source_cuda += glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', 'cuda', '*.cpp'))
 
     sources = main_file + source_cpu
     extension = CppExtension
diff --git a/test/test_ops.py b/test/test_ops.py
@@ -71,6 +71,12 @@ def test_autocast(self, x_dtype, rois_dtype):
         with torch.cuda.amp.autocast():
             self.test_forward(torch.device("cuda"), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)
 
+    @pytest.mark.parametrize('x_dtype', (torch.float, torch.half))
+    @pytest.mark.parametrize('rois_dtype', (torch.float, torch.half))
+    def test_autocast_cpu(self, x_dtype, rois_dtype):
+        with torch.cpu.amp.autocast():
+            self.test_forward(torch.device("cpu"), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)
+
     def _helper_boxes_shape(self, func):
         # test boxes as Tensor[N, 5]
         with pytest.raises(AssertionError):
@@ -278,6 +284,14 @@ def test_autocast(self, aligned, x_dtype, rois_dtype):
             self.test_forward(torch.device("cuda"), contiguous=False, aligned=aligned, x_dtype=x_dtype,
                               rois_dtype=rois_dtype)
 
+    @pytest.mark.parametrize('aligned', (True, False))
+    @pytest.mark.parametrize('x_dtype', (torch.float, torch.half))
+    @pytest.mark.parametrize('rois_dtype', (torch.float, torch.half))
+    def test_autocast_cpu(self, aligned, x_dtype, rois_dtype):
+        with torch.cpu.amp.autocast():
+            self.test_forward(torch.device("cpu"), contiguous=False, aligned=aligned, x_dtype=x_dtype,
+                              rois_dtype=rois_dtype)
+
     def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000):
         rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
         rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,))  # set batch index
@@ -514,13 +528,27 @@ def test_nms_cuda(self, iou, dtype=torch.float64):
             is_eq = torch.allclose(scores[r_cpu], scores[r_cuda.cpu()], rtol=tol, atol=tol)
         assert is_eq, err_msg.format(iou)
 
+
     @needs_cuda
     @pytest.mark.parametrize("iou", (.2, .5, .8))
     @pytest.mark.parametrize("dtype", (torch.float, torch.half))
     def test_autocast(self, iou, dtype):
         with torch.cuda.amp.autocast():
             self.test_nms_cuda(iou=iou, dtype=dtype)
 
+    @pytest.mark.parametrize("iou", (.2, .5, .8))
+    @pytest.mark.parametrize("dtype", (torch.bfloat16,))
+    def test_autocast_cpu(self, iou, dtype):
+        with torch.cpu.amp.autocast():
+            def test_nms_cpu(iou, dtype):
+                boxes, scores = self._create_tensors_with_iou(1000, iou)
+                boxes = boxes.to(dtype=dtype)
+                scores = scores.to(dtype=dtype)
+                out = ops.nms(boxes, scores, iou)
+                outf = ops.nms(boxes.float(), scores.float(), iou)
+                torch.testing.assert_close(out, outf)
+            test_nms_cpu(iou=iou, dtype=dtype)
+
     @needs_cuda
     def test_nms_cuda_float16(self):
         boxes = torch.tensor([[285.3538, 185.5758, 1193.5110, 851.4551],
@@ -774,6 +802,12 @@ def test_autocast(self, batch_sz, dtype):
         with torch.cuda.amp.autocast():
             self.test_forward(torch.device("cuda"), contiguous=False, batch_sz=batch_sz, dtype=dtype)
 
+    @pytest.mark.parametrize('batch_sz', (0, 33))
+    @pytest.mark.parametrize('dtype', (torch.float, torch.half))
+    def test_autocast_cpu(self, batch_sz, dtype):
+        with torch.cpu.amp.autocast():
+            self.test_forward(torch.device("cpu"), contiguous=False, batch_sz=batch_sz, dtype=dtype)
+
     def test_forward_scriptability(self):
         # Non-regression test for https://github.com/pytorch/vision/issues/4078
         torch.jit.script(ops.DeformConv2d(in_channels=8, out_channels=8, kernel_size=3))
diff --git a/torchvision/csrc/ops/autocast/cpu/deform_conv2d_kernel.cpp b/torchvision/csrc/ops/autocast/cpu/deform_conv2d_kernel.cpp
@@ -0,0 +1,54 @@
+#include "../../deform_conv2d.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor deform_conv2d_autocast(
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& offset,
+    const at::Tensor& mask,
+    const at::Tensor& bias,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_h,
+    int64_t pad_w,
+    int64_t dilation_h,
+    int64_t dilation_w,
+    int64_t groups,
+    int64_t offset_groups,
+    bool use_mask) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return deform_conv2d(
+             at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, weight, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, offset, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, mask, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, bias, c10::DeviceType::CPU),
+             stride_h,
+             stride_w,
+             pad_h,
+             pad_w,
+             dilation_h,
+             dilation_w,
+             groups,
+             offset_groups,
+             use_mask)
+      .to(input.scalar_type());
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::deform_conv2d"),
+      TORCH_FN(deform_conv2d_autocast));
+}
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/ops/autocast/cpu/nms_kernel.cpp b/torchvision/csrc/ops/autocast/cpu/nms_kernel.cpp
@@ -0,0 +1,29 @@
+#include "../../nms.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor nms_autocast(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return nms(
+      at::autocast::cached_cast(at::kFloat, dets, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, scores, c10::DeviceType::CPU),
+      iou_threshold);
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_autocast));
+}
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/ops/autocast/cpu/ps_roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/cpu/ps_roi_align_kernel.cpp
@@ -0,0 +1,41 @@
+#include "../../ps_roi_align.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> ps_roi_align_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = ps_roi_align(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::ps_roi_align"),
+      TORCH_FN(ps_roi_align_autocast));
+}
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/ops/autocast/cpu/ps_roi_pool_kernel.cpp b/torchvision/csrc/ops/autocast/cpu/ps_roi_pool_kernel.cpp
@@ -0,0 +1,39 @@
+#include "../../ps_roi_pool.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> ps_roi_pool_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = ps_roi_pool(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::ps_roi_pool"),
+      TORCH_FN(ps_roi_pool_autocast));
+}
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/ops/autocast/cpu/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/cpu/roi_align_kernel.cpp
@@ -0,0 +1,40 @@
+#include "../../roi_align.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor roi_align_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return roi_align(
+             at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+             spatial_scale,
+             pooled_height,
+             pooled_width,
+             sampling_ratio,
+             aligned)
+      .to(input.scalar_type());
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      TORCH_FN(roi_align_autocast));
+}
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/ops/autocast/cpu/roi_pool_kernel.cpp b/torchvision/csrc/ops/autocast/cpu/roi_pool_kernel.cpp
@@ -0,0 +1,39 @@
+#include "../../roi_pool.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> roi_pool_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = roi_pool(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_pool"),
+      TORCH_FN(roi_pool_autocast));
+}
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/ops/autocast/cuda/deform_conv2d_kernel.cpp b/torchvision/csrc/ops/autocast/cuda/deform_conv2d_kernel.cpp
@@ -1,4 +1,4 @@
-#include "../deform_conv2d.h"
+#include "../../deform_conv2d.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>
diff --git a/torchvision/csrc/ops/autocast/cuda/nms_kernel.cpp b/torchvision/csrc/ops/autocast/cuda/nms_kernel.cpp
@@ -1,4 +1,4 @@
-#include "../nms.h"
+#include "../../nms.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>
diff --git a/torchvision/csrc/ops/autocast/cuda/ps_roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/cuda/ps_roi_align_kernel.cpp
@@ -1,4 +1,4 @@
-#include "../ps_roi_align.h"
+#include "../../ps_roi_align.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>
diff --git a/torchvision/csrc/ops/autocast/cuda/ps_roi_pool_kernel.cpp b/torchvision/csrc/ops/autocast/cuda/ps_roi_pool_kernel.cpp
@@ -1,4 +1,4 @@
-#include "../ps_roi_pool.h"
+#include "../../ps_roi_pool.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>
diff --git a/torchvision/csrc/ops/autocast/cuda/roi_align_kernel.cpp b/torchvision/csrc/ops/autocast/cuda/roi_align_kernel.cpp
@@ -1,4 +1,4 @@
-#include "../roi_align.h"
+#include "../../roi_align.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>
diff --git a/torchvision/csrc/ops/autocast/cuda/roi_pool_kernel.cpp b/torchvision/csrc/ops/autocast/cuda/roi_pool_kernel.cpp
@@ -1,4 +1,4 @@
-#include "../roi_pool.h"
+#include "../../roi_pool.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-#include "../deform_conv2d.h"`
	`1`	`+#include "../../deform_conv2d.h"`
`2`	`2`
`3`	`3`	`#include <ATen/autocast_mode.h>`
`4`	`4`	`#include <torch/types.h>`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-#include "../nms.h"`
	`1`	`+#include "../../nms.h"`
`2`	`2`
`3`	`3`	`#include <ATen/autocast_mode.h>`
`4`	`4`	`#include <torch/types.h>`