
Commit 53c7e00

Register ops to AutocastCPU
* Modify the directory structure: move the autocast files from torchvision/csrc/ops/autocast/ to torchvision/csrc/ops/autocast/cuda;
* add the cpu directory under the autocast directory;
* register deform_conv2d, nms, ps_roi_align, ps_roi_pool, roi_align, and roi_pool to AutocastCPU.
1 parent 719e120 commit 53c7e00

14 files changed, +274 -24 lines
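From Python, these registrations let the existing torchvision ops run inside torch.cpu.amp.autocast the same way they already do under torch.cuda.amp.autocast. A minimal sketch of what that enables (shapes and values here are illustrative, mirroring the updated tests below):

import torch
from torchvision.ops import roi_align

# Half-precision inputs; the new AutocastCPU kernel upcasts them to float32
# internally and casts the result back to the input dtype.
x = torch.rand(1, 3, 16, 16, dtype=torch.half)
rois = torch.tensor([[0, 0.0, 0.0, 8.0, 8.0]], dtype=torch.half)  # (batch_idx, x1, y1, x2, y2)

with torch.cpu.amp.autocast():
    out = roi_align(x, rois, output_size=(4, 4), spatial_scale=1.0)

print(out.dtype)  # torch.float16, matching the input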

setup.py
Lines changed: 2 additions & 1 deletion

@@ -142,6 +142,7 @@ def get_extensions():
                                  '*.cpp'))
     source_cpu = (
         glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) +
+        glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', 'cpu', '*.cpp')) +
         glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp')) +
         glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
     )
@@ -170,7 +171,7 @@ def get_extensions():
     else:
         source_cuda = glob.glob(os.path.join(extensions_dir, 'ops', 'cuda', '*.cu'))
 
-        source_cuda += glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', '*.cpp'))
+        source_cuda += glob.glob(os.path.join(extensions_dir, 'ops', 'autocast', 'cuda', '*.cpp'))
 
     sources = main_file + source_cpu
     extension = CppExtension

test/test_ops.py
Lines changed: 22 additions & 17 deletions

@@ -6,7 +6,7 @@
 import numpy as np
 
 import torch
-from functools import lru_cache
+from functools import lru_cache, partial
 from torch import Tensor
 from torch.autograd import gradcheck
 from torch.nn.modules.utils import _pair
@@ -64,12 +64,13 @@ def func(z):
         gradcheck(func, (x,))
         gradcheck(script_func, (x,))
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize('x_dtype', (torch.float, torch.half))
     @pytest.mark.parametrize('rois_dtype', (torch.float, torch.half))
-    def test_autocast(self, x_dtype, rois_dtype):
-        with torch.cuda.amp.autocast():
-            self.test_forward(torch.device("cuda"), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)
+    def test_autocast(self, device, x_dtype, rois_dtype):
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            self.test_forward(torch.device(device), contiguous=False, x_dtype=x_dtype, rois_dtype=rois_dtype)
 
     def _helper_boxes_shape(self, func):
         # test boxes as Tensor[N, 5]
@@ -269,13 +270,14 @@ def test_forward(self, device, contiguous, aligned, x_dtype=None, rois_dtype=Non
         super().test_forward(device=device, contiguous=contiguous, x_dtype=x_dtype, rois_dtype=rois_dtype,
                              aligned=aligned)
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize('aligned', (True, False))
     @pytest.mark.parametrize('x_dtype', (torch.float, torch.half))
     @pytest.mark.parametrize('rois_dtype', (torch.float, torch.half))
-    def test_autocast(self, aligned, x_dtype, rois_dtype):
-        with torch.cuda.amp.autocast():
-            self.test_forward(torch.device("cuda"), contiguous=False, aligned=aligned, x_dtype=x_dtype,
+    def test_autocast(self, device, aligned, x_dtype, rois_dtype):
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            self.test_forward(torch.device(device), contiguous=False, aligned=aligned, x_dtype=x_dtype,
                               rois_dtype=rois_dtype)
 
     def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000):
@@ -514,12 +516,14 @@ def test_nms_cuda(self, iou, dtype=torch.float64):
             is_eq = torch.allclose(scores[r_cpu], scores[r_cuda.cpu()], rtol=tol, atol=tol)
         assert is_eq, err_msg.format(iou)
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize("iou", (.2, .5, .8))
     @pytest.mark.parametrize("dtype", (torch.float, torch.half))
-    def test_autocast(self, iou, dtype):
-        with torch.cuda.amp.autocast():
-            self.test_nms_cuda(iou=iou, dtype=dtype)
+    def test_autocast(self, device, iou, dtype):
+        test_fn = self.test_nms_ref if device == 'cpu' else partial(self.test_nms_cuda, dtype=dtype)
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            test_fn(iou=iou)
 
     @needs_cuda
     def test_nms_cuda_float16(self):
@@ -767,12 +771,13 @@ def test_compare_cpu_cuda_grads(self, contiguous):
         res_grads = init_weight.grad.to("cpu")
         torch.testing.assert_close(true_cpu_grads, res_grads)
 
-    @needs_cuda
+    @pytest.mark.parametrize('device', cpu_and_gpu())
     @pytest.mark.parametrize('batch_sz', (0, 33))
     @pytest.mark.parametrize('dtype', (torch.float, torch.half))
-    def test_autocast(self, batch_sz, dtype):
-        with torch.cuda.amp.autocast():
-            self.test_forward(torch.device("cuda"), contiguous=False, batch_sz=batch_sz, dtype=dtype)
+    def test_autocast(self, device, batch_sz, dtype):
+        cm = torch.cpu.amp.autocast if device == 'cpu' else torch.cuda.amp.autocast
+        with cm():
+            self.test_forward(torch.device(device), contiguous=False, batch_sz=batch_sz, dtype=dtype)
 
     def test_forward_scriptability(self):
         # Non-regression test for https://github.com/pytorch/vision/issues/4078
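In these tests, cpu_and_gpu() is a helper from torchvision's test utilities that supplies the device parameters: roughly, it yields 'cpu' unconditionally and 'cuda' only when a GPU can be used, so test_autocast now exercises both AutocastCPU and AutocastCUDA. A simplified sketch of the idea (the real helper lives elsewhere in the test suite and may differ, e.g. by attaching a skip marker to the CUDA case):

import torch

def cpu_and_gpu():
    # 'cpu' always runs; 'cuda' is added only when a GPU is available.
    devices = ['cpu']
    if torch.cuda.is_available():
        devices.append('cuda')
    return devices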
torchvision/csrc/ops/autocast/cpu/deform_conv2d_kernel.cpp (new file)
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+#include "../../deform_conv2d.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor deform_conv2d_autocast(
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& offset,
+    const at::Tensor& mask,
+    const at::Tensor& bias,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_h,
+    int64_t pad_w,
+    int64_t dilation_h,
+    int64_t dilation_w,
+    int64_t groups,
+    int64_t offset_groups,
+    bool use_mask) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return deform_conv2d(
+             at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+             at::autocast::cached_cast(
+                 at::kFloat, weight, c10::DeviceType::CPU),
+             at::autocast::cached_cast(
+                 at::kFloat, offset, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, mask, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, bias, c10::DeviceType::CPU),
+             stride_h,
+             stride_w,
+             pad_h,
+             pad_w,
+             dilation_h,
+             dilation_w,
+             groups,
+             offset_groups,
+             use_mask)
+      .to(input.scalar_type());
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::deform_conv2d"),
+      TORCH_FN(deform_conv2d_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/nms_kernel.cpp (new file)
Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#include "../../nms.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor nms_autocast(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return nms(
+      at::autocast::cached_cast(at::kFloat, dets, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, scores, c10::DeviceType::CPU),
+      iou_threshold);
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/ps_roi_align_kernel.cpp (new file)
Lines changed: 41 additions & 0 deletions

@@ -0,0 +1,41 @@
+#include "../../ps_roi_align.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> ps_roi_align_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = ps_roi_align(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width,
+      sampling_ratio);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::ps_roi_align"),
+      TORCH_FN(ps_roi_align_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/ps_roi_pool_kernel.cpp (new file)
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+#include "../../ps_roi_pool.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> ps_roi_pool_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = ps_roi_pool(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::ps_roi_pool"),
+      TORCH_FN(ps_roi_pool_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/roi_align_kernel.cpp (new file)
Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
+#include "../../roi_align.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+at::Tensor roi_align_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  return roi_align(
+             at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+             at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+             spatial_scale,
+             pooled_height,
+             pooled_width,
+             sampling_ratio,
+             aligned)
+      .to(input.scalar_type());
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      TORCH_FN(roi_align_autocast));
+}
+
+} // namespace ops
+} // namespace vision
torchvision/csrc/ops/autocast/cpu/roi_pool_kernel.cpp (new file)
Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
+#include "../../roi_pool.h"
+
+#include <ATen/autocast_mode.h>
+#include <torch/types.h>
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+std::tuple<at::Tensor, at::Tensor> roi_pool_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastCPU);
+  auto result = roi_pool(
+      at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::CPU),
+      at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::CPU),
+      spatial_scale,
+      pooled_height,
+      pooled_width);
+
+  return std::make_tuple(
+      std::get<0>(result).to(input.scalar_type()),
+      std::get<1>(result).to(input.scalar_type()));
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, AutocastCPU, m) {
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_pool"),
+      TORCH_FN(roi_pool_autocast));
+}
+
+} // namespace ops
+} // namespace vision
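All six new CPU autocast kernels above follow the same pattern: exclude the AutocastCPU dispatch key so the inner call goes straight to the regular backend kernel, upcast the floating-point tensor arguments to float32 with at::autocast::cached_cast, run the op, and, for ops that return floating-point tensors, cast the result back to the input dtype. A small illustrative example of how the nms registration is exercised from Python (input values are made up):

import torch
from torchvision.ops import nms

boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [1.0, 1.0, 11.0, 11.0]], dtype=torch.half)
scores = torch.tensor([0.9, 0.8], dtype=torch.half)

with torch.cpu.amp.autocast():
    # Dispatches to nms_autocast, which runs the underlying kernel in float32.
    keep = nms(boxes, scores, iou_threshold=0.5)

print(keep)  # e.g. tensor([0]): the lower-scoring overlapping box is suppressed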

torchvision/csrc/ops/autocast/deform_conv2d_kernel.cpp renamed to torchvision/csrc/ops/autocast/cuda/deform_conv2d_kernel.cpp
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-#include "../deform_conv2d.h"
+#include "../../deform_conv2d.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>

torchvision/csrc/ops/autocast/nms_kernel.cpp renamed to torchvision/csrc/ops/autocast/cuda/nms_kernel.cpp
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-#include "../nms.h"
+#include "../../nms.h"
 
 #include <ATen/autocast_mode.h>
 #include <torch/types.h>
