
Commit 53c7e00

Register ops to AutocastCPU
* Modify the directory structure: move the autocast files from torchvision/csrc/ops/autocast/ to torchvision/csrc/ops/autocast/cuda;
* add the cpu directory under the autocast directory;
* register deform_conv2d, nms, ps_roi_align, ps_roi_pool, roi_align, and roi_pool to AutocastCPU.
1 parent 719e120 commit 53c7e00

14 files changed, +274 -24 lines
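From Python, these registrations let the existing torchvision ops run inside torch.cpu.amp.autocast the same way they already do under torch.cuda.amp.autocast. A minimal sketch of what that enables (shapes and values here are illustrative, mirroring the updated tests below):

import torch
from torchvision.ops import roi_align

# Half-precision inputs; the new AutocastCPU kernel upcasts them to float32
# internally and casts the result back to the input dtype.
x = torch.rand(1, 3, 16, 16, dtype=torch.half)
rois = torch.tensor([[0, 0.0, 0.0, 8.0, 8.0]], dtype=torch.half)  # (batch_idx, x1, y1, x2, y2)

with torch.cpu.amp.autocast():
    out = roi_align(x, rois, output_size=(4, 4), spatial_scale=1.0)

print(out.dtype)  # torch.float16, matching the input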

setup.py
Lines changed: 2 additions & 1 deletion

@@ -142,6 +142,7 @@ def get_extensions():
                                  '*.cpp'))
     source_cpu = (
         glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) +
+        glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', 'cpu', '*.cpp')) +
         glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp')) +
         glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
     )
@@ -170,7 +171,7 @@ def get_extensions():
     else:
         source_cuda = glob.glob(os.path.join(extensions_dir, 'ops', 'cuda', '*.cu'))
 
-        source_cuda += glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', '*.cpp'))
+        source_cuda += glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', 'cuda', '*.cpp'))
 
     sources = main_file + source_cpu
     extension = CppExtension

test/test_ops.py
Lines changed: 22 additions & 17 deletions

@@ -6,7 +6,7 @@
 import numpy as np
 
 import torch
-from functools import lru_cache
+from functools import lru_cache, partial
 from torch import Tensor
 from torch.autograd import gradcheck
 from torch.nn.modules.utils import _pair
@@ -64,12 +64,13 @@ def func(z):
         gradcheck(func, (x,))
         gradcheck(script_func, (x,))
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize('x_dtype', (torch.float, torch.half))
     @pytest.mark.parametrize('rois_dtype', (torch.float, torch.half))
-    def test_autocast(self, x_dtype, rois_dtype):
-        with torch.cuda.amp.autocast():
-            self.test_forward(torch.device("cuda"), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)
+    def test_autocast(self, device, x_dtype, rois_dtype):
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            self.test_forward(torch.device(device), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)
 
     def _helper_boxes_shape(self, func):
         # test boxes as Tensor[N, 5]
@@ -269,13 +270,14 @@ def test_forward(self, device, contiguous, aligned, x_dtype=None, rois_dtype=Non
         super().test_forward(device=device, contiguous=contiguous, x_dtype=x_dtype, rois_dtype=rois_dtype,
                              aligned=aligned)
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize('aligned', (True, False))
     @pytest.mark.parametrize('x_dtype', (torch.float, torch.half))
     @pytest.mark.parametrize('rois_dtype', (torch.float, torch.half))
-    def test_autocast(self, aligned, x_dtype, rois_dtype):
-        with torch.cuda.amp.autocast():
-            self.test_forward(torch.device("cuda"), contiguous=False, aligned=aligned, x_dtype=x_dtype,
+    def test_autocast(self, device, aligned, x_dtype, rois_dtype):
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            self.test_forward(torch.device(device), contiguous=False, aligned=aligned, x_dtype=x_dtype,
                               rois_dtype=rois_dtype)
 
     def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000):
@@ -514,12 +516,14 @@ def test_nms_cuda(self, iou, dtype=torch.float64):
             is_eq = torch.allclose(scores[r_cpu], scores[r_cuda.cpu()], rtol=tol, atol=tol)
         assert is_eq, err_msg.format(iou)
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize("iou", (.2, .5, .8))
     @pytest.mark.parametrize("dtype", (torch.float, torch.half))
-    def test_autocast(self, iou, dtype):
-        with torch.cuda.amp.autocast():
-            self.test_nms_cuda(iou=iou, dtype=dtype)
+    def test_autocast(self, device, iou, dtype):
+        test_fn = self.test_nms_ref if device == 'cpu' else partial(self.test_nms_cuda, dtype=dtype)
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            test_fn(iou=iou)
 
     @needs_cuda
     def test_nms_cuda_float16(self):
@@ -767,12 +771,13 @@ def test_compare_cpu_cuda_grads(self, contiguous):
         res_grads = init_weight.grad.to("cpu")
         torch.testing.assert_close(true_cpu_grads, res_grads)
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize('batch_sz', (0, 33))
     @pytest.mark.parametrize('dtype', (torch.float, torch.half))
-    def test_autocast(self, batch_sz, dtype):
-        with torch.cuda.amp.autocast():
-            self.test_forward(torch.device("cuda"), contiguous=False, batch_sz=batch_sz, dtype=dtype)
+    def test_autocast(self, device, batch_sz, dtype):
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            self.test_forward(torch.device(device), contiguous=False, batch_sz=batch_sz, dtype=dtype)
 
     def test_forward_scriptability(self):
         # Non-regression test for https://github.com/pytorch/vision/issues/4078
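In these tests, cpu_and_gpu() is a helper from torchvision's test utilities that supplies the device parameters: roughly, it yields 'cpu' unconditionally and 'cuda' only when a GPU can be used, so test_autocast now exercises both AutocastCPU and AutocastCUDA. A simplified sketch of the idea (the real helper lives elsewhere in the test suite and may differ, e.g. by attaching a skip marker to the CUDA case):

import torch

def cpu_and_gpu():
    # 'cpu' always runs; 'cuda' is added only when a GPU is available.
    devices = ['cpu']
    if torch.cuda.is_available():
        devices.append('cuda')
    return devices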
torchvision/csrc/ops/autocast/cpu/deform_conv2d_kernel.cpp (new file)
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+#include "../../deform_conv2d.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor deform_conv2d_autocast(
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& offset,
+    const at::Tensor& mask,
+    const at::Tensor& bias,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_h,
+    int64_t pad_w,
+    int64_t dilation_h,
+    int64_t dilation_w,
+    int64_t groups,
+    int64_t offset_groups,
+    bool use_mask) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return deform_conv2d(
+             at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+             at::autocast::cached_cast(
+                 at::kFloat, weight, c10::DeviceType::CPU),
+             at::autocast::cached_cast(
+                 at::kFloat, offset, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, mask, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, bias, c10::DeviceType::CPU),
+             stride_h,
+             stride_w,
+             pad_h,
+             pad_w,
+             dilation_h,
+             dilation_w,
+             groups,
+             offset_groups,
+             use_mask)
+      .to(input.scalar_type());
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::deform_conv2d"),
+      TORCH_FN(deform_conv2d_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/nms_kernel.cpp (new file)
Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#include "../../nms.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor nms_autocast(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return nms(
+      at::autocast::cached_cast(at::kFloat, dets, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, scores, c10::DeviceType::CPU),
+      iou_threshold);
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/ps_roi_align_kernel.cpp (new file)
Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+#include "../../ps_roi_align.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> ps_roi_align_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = ps_roi_align(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::ps_roi_align"),
+      TORCH_FN(ps_roi_align_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/ps_roi_pool_kernel.cpp (new file)
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+#include "../../ps_roi_pool.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> ps_roi_pool_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = ps_roi_pool(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::ps_roi_pool"),
+      TORCH_FN(ps_roi_pool_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/roi_align_kernel.cpp (new file)
Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+#include "../../roi_align.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor roi_align_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return roi_align(
+             at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+             spatial_scale,
+             pooled_height,
+             pooled_width,
+             sampling_ratio,
+             aligned)
+      .to(input.scalar_type());
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      TORCH_FN(roi_align_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/roi_pool_kernel.cpp (new file)
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+#include "../../roi_pool.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> roi_pool_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = roi_pool(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_pool"),
+      TORCH_FN(roi_pool_autocast));
+}
+
+} // namespace ops
+} // namespace vision
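All six new CPU autocast kernels above follow the same pattern: exclude the AutocastCPU dispatch key so the inner call goes straight to the regular backend kernel, upcast the floating-point tensor arguments to float32 with at::autocast::cached_cast, run the op, and, for ops that return floating-point tensors, cast the result back to the input dtype. A small illustrative example of how the nms registration is exercised from Python (input values are made up):

import torch
from torchvision.ops import nms

boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [1.0, 1.0, 11.0, 11.0]], dtype=torch.half)
scores = torch.tensor([0.9, 0.8], dtype=torch.half)

with torch.cpu.amp.autocast():
    # Dispatches to nms_autocast, which runs the underlying kernel in float32.
    keep = nms(boxes, scores, iou_threshold=0.5)

print(keep)  # e.g. tensor([0]): the lower-scoring overlapping box is suppressed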

torchvision/csrc/ops/autocast/deform_conv2d_kernel.cpp renamed to torchvision/csrc/ops/autocast/cuda/deform_conv2d_kernel.cpp
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-#include "../deform_conv2d.h"
+#include "../../deform_conv2d.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>

torchvision/csrc/ops/autocast/nms_kernel.cpp renamed to torchvision/csrc/ops/autocast/cuda/nms_kernel.cpp
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-#include "../nms.h"
+#include "../../nms.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>
