@@ -80,13 +80,19 @@

 using namespace at;

-const unsigned int CUDA_NUM_THREADS = 1024;
 const int kMaxParallelImgs = 32;

-inline unsigned int GET_BLOCKS(const unsigned int N) {
+inline unsigned int GET_THREADS() {
+  if (at::cuda::getCurrentDeviceProperties()->major >= 6) {
+    return 1024;
+  }
+  return 512;
+}
+
+inline unsigned int GET_BLOCKS(const unsigned int THREADS, const unsigned int N) {
   unsigned int kMaxGridNum =
       at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
-  return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
+  return std::min(kMaxGridNum, (N + THREADS - 1) / THREADS);
 }

 template <typename scalar_t>
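The new helpers pick the block size from the device's compute capability (1024 threads on compute capability 6.0 and newer, 512 on older parts) and clamp the grid to the device's maxGridSize[0]. Below is a minimal sketch, not part of the patch, of how they compose at a call site; scale_kernel and launch_scale are hypothetical names. The grid-stride loop is what makes the clamp safe: when the grid is capped, each thread simply processes more than one index.

// Hypothetical example, assuming the GET_THREADS/GET_BLOCKS helpers above.
// The grid-stride loop lets a grid clamped to maxGridSize[0] still visit
// every index: each thread advances by the total thread count per iteration.
__global__ void scale_kernel(unsigned int n, float* data, float alpha) {
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    data[i] *= alpha;
  }
}

void launch_scale(unsigned int n, float* data, float alpha) {
  const unsigned int threads = GET_THREADS();          // 1024 or 512
  const unsigned int blocks = GET_BLOCKS(threads, n);  // ceil(n / threads), clamped
  scale_kernel<<<blocks, threads>>>(n, data, alpha);
}

At each launch site below, the threads/blocks pair is hoisted out of AT_DISPATCH_FLOATING_TYPES_AND_HALF, so it is computed once per call rather than once inside each dtype branch.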
@@ -224,11 +230,14 @@ static void deformable_im2col(
     at::Tensor data_col) {
   int num_kernels = n_in_channels * out_h * out_w * parallel_imgs;

+  const unsigned int threads = GET_THREADS();
+  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
+
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       input.scalar_type(), "deformable_im2col_gpu", ([&] {
         deformable_im2col_gpu_kernel<<<
-            GET_BLOCKS(num_kernels),
-            CUDA_NUM_THREADS>>>(
+            blocks,
+            threads>>>(
             num_kernels,
             input.data_ptr<scalar_t>(),
             data_offset.data_ptr<scalar_t>(),
@@ -585,11 +594,14 @@ static void compute_grad_input(
   int num_kernels =
       channels * weight_h * weight_w * out_h * out_w * parallel_imgs;

+  const unsigned int threads = GET_THREADS();
+  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
+
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       columns.scalar_type(), "deformable_col2im_gpu", ([&] {
         deformable_col2im_gpu_kernel<<<
-            GET_BLOCKS(num_kernels),
-            CUDA_NUM_THREADS>>>(
+            blocks,
+            threads>>>(
             num_kernels,
             columns.data_ptr<scalar_t>(),
             offset.data_ptr<scalar_t>(),
@@ -790,11 +802,14 @@ static void compute_grad_offset_and_mask(
   int num_kernels =
       out_h * out_w * 2 * weight_h * weight_w * n_offset_grps * parallel_imgs;

+  const unsigned int threads = GET_THREADS();
+  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
+
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       columns.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
         deformable_col2im_coord_gpu_kernel<<<
-            GET_BLOCKS(num_kernels),
-            CUDA_NUM_THREADS>>>(
+            blocks,
+            threads>>>(
             num_kernels,
             columns.data_ptr<scalar_t>(),
             input.data_ptr<scalar_t>(),
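To sanity-check what configuration these calls produce on a given device, a small hypothetical helper (not part of the patch) can print the values the two functions return:

#include <cstdio>
#include <ATen/cuda/CUDAContext.h>

// Hypothetical debugging aid: report the launch configuration the helpers
// above would pick for a given problem size on the current device.
void print_launch_config(unsigned int num_kernels) {
  const cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
  const unsigned int threads = GET_THREADS();
  const unsigned int blocks = GET_BLOCKS(threads, num_kernels);
  std::printf("cc %d.%d: threads=%u, blocks=%u\n",
              prop->major, prop->minor, threads, blocks);
}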