From 54b853835af76286ee6c2ea05918c05e5da2828f Mon Sep 17 00:00:00 2001 From: = <=> Date: Sat, 10 Jul 2021 16:56:19 +0000 Subject: [PATCH 01/12] Clean up padding kernels --- nestedtensor/csrc/cuda/padding.cu | 54 +++++++++++++++++-------------- nestedtensor/csrc/masking.cpp | 10 +++--- nestedtensor/version.py | 4 +-- 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index fb9550d3..10d1a137 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -92,7 +92,7 @@ void add_padding_2( } } -template +template __global__ void add_padding_3( const T* input, @@ -101,41 +101,42 @@ void add_padding_3( const int* offsets, const int* input_sizes, int input_dim, - const int* output_sizes, + int output_sizes_1, + int output_sizes_2, + int output_sizes_3, + int output_sizes_2_3, const int batch_size) { const int batch_id = blockIdx.x; const int grid_id = blockIdx.y; const int tid = threadIdx.x + grid_id * 256; - const int grainsize = 16 * 256; const int offset = offsets[batch_id]; const int* sizes_i = input_sizes + batch_id * input_dim; const int numel_i = sizes_i[0] * sizes_i[1] * sizes_i[2]; - const int output_offset = batch_id * output_sizes[1] * output_sizes[2] * output_sizes[3]; - const int output_numel = output_sizes[1] * output_sizes[2] * output_sizes[3]; + const int output_offset = batch_id * output_sizes_1 * output_sizes_2_3; + const int output_numel = output_sizes_1 * output_sizes_2_3; + const int sizes_0 = sizes_i[0]; + const int sizes_1 = sizes_i[1]; + const int sizes_2 = sizes_i[2]; + output = output + output_offset; + input = input + offset; for (int ii = 0; ii < (output_numel / grainsize); ii++) { const int i = ii * grainsize + tid; - const int i0 = i / (output_sizes[2] * output_sizes[3]); - const int i1 = (i % (output_sizes[2] * output_sizes[3])) / output_sizes[3]; - const int i2 = i % output_sizes[3]; - if (i0 < sizes_i[0] && i1 < sizes_i[1] && i2 < sizes_i[2]) { - const int input_offset = offset + i0 * (sizes_i[1] * sizes_i[2]) + i1 * sizes_i[2] + i2; - output[output_offset + i] = input[input_offset]; - } else { - output[output_offset + i] = padding_value; - } + const int i0 = i / (output_sizes_2_3); + const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; + const int i2 = i % output_sizes_3; + bool valid = i0 < sizes_0 && i1 < sizes_1 && i2 < sizes_2; + const int input_offset = i0 * (sizes_1 * sizes_2) + i1 * sizes_2 + i2; + output[i] = valid ? input[input_offset] : padding_value; } const int i = (output_numel / grainsize) * grainsize + tid; if (i < output_numel) { - const int i0 = i / (output_sizes[2] * output_sizes[3]); - const int i1 = (i % (output_sizes[2] * output_sizes[3])) / output_sizes[3]; - const int i2 = i % output_sizes[3]; - if (i0 < sizes_i[0] && i1 < sizes_i[1] && i2 < sizes_i[2]) { - const int input_offset = offset + i0 * (sizes_i[1] * sizes_i[2]) + i1 * sizes_i[2] + i2; - output[output_offset + i] = input[input_offset]; - } else { - output[output_offset + i] = padding_value; - } + const int i0 = i / (output_sizes_2_3); + const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; + const int i2 = i % output_sizes_3; + bool valid = i0 < sizes_0 && i1 < sizes_1 && i2 < sizes_2; + const int input_offset = i0 * (sizes_1 * sizes_2) + i1 * sizes_2 + i2; + output[i] = valid ? input[input_offset] : padding_value; } } @@ -177,14 +178,17 @@ void add_padding_kernelLauncher( batch_size); } if (input_dim == 3) { - add_padding_3<<>>( + add_padding_3<<>>( input, output, padding_value, offsets, input_sizes, input_dim, - output_sizes, + output_sizes[1], + output_sizes[2], + output_sizes[3], + output_sizes[2] * output_sizes[3], batch_size); } } diff --git a/nestedtensor/csrc/masking.cpp b/nestedtensor/csrc/masking.cpp index 2ca43b68..3a98cdcd 100644 --- a/nestedtensor/csrc/masking.cpp +++ b/nestedtensor/csrc/masking.cpp @@ -507,23 +507,21 @@ Tensor to_padded_tensor(Tensor nt, double padding) { std::vector new_size = padded_size_from_efficient_size(esize); at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream(); Tensor output = at::empty(IntArrayRef(new_size), nt_buffer.options()); - Tensor new_size_tensor = torch::tensor(new_size); + Tensor new_size_tensor = torch::tensor(new_size, torch::kInt32); int64_t input_dim = nt_sizes.size(1); int64_t batch_size = nt_sizes.size(0); - at::Tensor metadata = at::cat({new_size_tensor, offsets, nt_sizes.reshape(-1)}); + at::Tensor metadata = at::cat({offsets, nt_sizes.reshape(-1)}); metadata = metadata.to(at::Device(kCUDA), torch::kInt32, true, true); std::vector split_sizes; - split_sizes.push_back(new_size_tensor.numel()); split_sizes.push_back(offsets.numel()); split_sizes.push_back(nt_sizes.numel()); std::vector split = at::split_with_sizes(metadata, IntArrayRef(split_sizes), 0); - new_size_tensor = split[0]; - offsets = split[1]; - nt_sizes = split[2]; + offsets = split[0]; + nt_sizes = split[1]; if (nt_buffer.dtype() == torch::kFloat16) { nested_tensor::cuda::add_padding_kernelLauncher( diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 46a23e4b..1ff2f7f8 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+33fb247' -git_version = '33fb2477c856f8185f1e9c1e9a6ca28065e43cf9' +__version__ = '0.1.4+7be8164' +git_version = '7be8164298f17dd9f510af0155ab7c4c50359d4e' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 7db1ebbc51501749b19f92c7f0e04ecb0bff7528 Mon Sep 17 00:00:00 2001 From: = <=> Date: Sat, 10 Jul 2021 17:09:46 +0000 Subject: [PATCH 02/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 4 ++-- nestedtensor/version.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 10d1a137..8bbe102f 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -120,16 +120,16 @@ void add_padding_3( const int sizes_2 = sizes_i[2]; output = output + output_offset; input = input + offset; + int i = tid; for (int ii = 0; ii < (output_numel / grainsize); ii++) { - const int i = ii * grainsize + tid; const int i0 = i / (output_sizes_2_3); const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; const int i2 = i % output_sizes_3; bool valid = i0 < sizes_0 && i1 < sizes_1 && i2 < sizes_2; const int input_offset = i0 * (sizes_1 * sizes_2) + i1 * sizes_2 + i2; output[i] = valid ? input[input_offset] : padding_value; + i += grainsize; } - const int i = (output_numel / grainsize) * grainsize + tid; if (i < output_numel) { const int i0 = i / (output_sizes_2_3); const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 1ff2f7f8..119d16cb 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+7be8164' -git_version = '7be8164298f17dd9f510af0155ab7c4c50359d4e' +__version__ = '0.1.4+54b8538' +git_version = '54b853835af76286ee6c2ea05918c05e5da2828f' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From b0557d7c694e1406c6a0a6cff323adf03200b4df Mon Sep 17 00:00:00 2001 From: = <=> Date: Sat, 10 Jul 2021 17:12:36 +0000 Subject: [PATCH 03/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 5 +++-- nestedtensor/version.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 8bbe102f..abf71e92 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -118,6 +118,7 @@ void add_padding_3( const int sizes_0 = sizes_i[0]; const int sizes_1 = sizes_i[1]; const int sizes_2 = sizes_i[2]; + const int sizes_1_2 = sizes_1 * sizes_2; output = output + output_offset; input = input + offset; int i = tid; @@ -126,7 +127,7 @@ void add_padding_3( const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; const int i2 = i % output_sizes_3; bool valid = i0 < sizes_0 && i1 < sizes_1 && i2 < sizes_2; - const int input_offset = i0 * (sizes_1 * sizes_2) + i1 * sizes_2 + i2; + const int input_offset = i0 * (sizes_1_2) + i1 * sizes_2 + i2; output[i] = valid ? input[input_offset] : padding_value; i += grainsize; } @@ -135,7 +136,7 @@ void add_padding_3( const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; const int i2 = i % output_sizes_3; bool valid = i0 < sizes_0 && i1 < sizes_1 && i2 < sizes_2; - const int input_offset = i0 * (sizes_1 * sizes_2) + i1 * sizes_2 + i2; + const int input_offset = i0 * (sizes_1_2) + i1 * sizes_2 + i2; output[i] = valid ? input[input_offset] : padding_value; } } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 119d16cb..4060a3f9 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+54b8538' -git_version = '54b853835af76286ee6c2ea05918c05e5da2828f' +__version__ = '0.1.4+7db1ebb' +git_version = '7db1ebbc51501749b19f92c7f0e04ecb0bff7528' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 6a98344c3b5c0a0c1422b3e6e7adcac46e7712cc Mon Sep 17 00:00:00 2001 From: = <=> Date: Sat, 10 Jul 2021 17:29:22 +0000 Subject: [PATCH 04/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 5 +++-- nestedtensor/version.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index abf71e92..6fbc4492 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -105,6 +105,7 @@ void add_padding_3( int output_sizes_2, int output_sizes_3, int output_sizes_2_3, + int output_numel, const int batch_size) { const int batch_id = blockIdx.x; @@ -113,8 +114,7 @@ void add_padding_3( const int offset = offsets[batch_id]; const int* sizes_i = input_sizes + batch_id * input_dim; const int numel_i = sizes_i[0] * sizes_i[1] * sizes_i[2]; - const int output_offset = batch_id * output_sizes_1 * output_sizes_2_3; - const int output_numel = output_sizes_1 * output_sizes_2_3; + const int output_offset = batch_id * output_numel; const int sizes_0 = sizes_i[0]; const int sizes_1 = sizes_i[1]; const int sizes_2 = sizes_i[2]; @@ -190,6 +190,7 @@ void add_padding_kernelLauncher( output_sizes[2], output_sizes[3], output_sizes[2] * output_sizes[3], + output_sizes[1] * output_sizes[2] * output_sizes[3], batch_size); } } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 4060a3f9..fa0f9bd7 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+7db1ebb' -git_version = '7db1ebbc51501749b19f92c7f0e04ecb0bff7528' +__version__ = '0.1.4+b0557d7' +git_version = 'b0557d7c694e1406c6a0a6cff323adf03200b4df' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From b4af6546cc11bf8458a0c69fb09d92f961eab014 Mon Sep 17 00:00:00 2001 From: = <=> Date: Sun, 11 Jul 2021 05:34:42 +0000 Subject: [PATCH 05/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 10 +--------- nestedtensor/version.py | 4 ++-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 6fbc4492..80afb925 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -122,7 +122,7 @@ void add_padding_3( output = output + output_offset; input = input + offset; int i = tid; - for (int ii = 0; ii < (output_numel / grainsize); ii++) { + for (;i < output_numel;) { const int i0 = i / (output_sizes_2_3); const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; const int i2 = i % output_sizes_3; @@ -131,14 +131,6 @@ void add_padding_3( output[i] = valid ? input[input_offset] : padding_value; i += grainsize; } - if (i < output_numel) { - const int i0 = i / (output_sizes_2_3); - const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; - const int i2 = i % output_sizes_3; - bool valid = i0 < sizes_0 && i1 < sizes_1 && i2 < sizes_2; - const int input_offset = i0 * (sizes_1_2) + i1 * sizes_2 + i2; - output[i] = valid ? input[input_offset] : padding_value; - } } template diff --git a/nestedtensor/version.py b/nestedtensor/version.py index fa0f9bd7..794286d5 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+b0557d7' -git_version = 'b0557d7c694e1406c6a0a6cff323adf03200b4df' +__version__ = '0.1.4+6a98344' +git_version = '6a98344c3b5c0a0c1422b3e6e7adcac46e7712cc' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 1fc28deca4c53569c7e43468617a3ea60e38d4fa Mon Sep 17 00:00:00 2001 From: = <=> Date: Sun, 11 Jul 2021 05:38:51 +0000 Subject: [PATCH 06/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 16 ++++++++-------- nestedtensor/version.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 80afb925..fe12c53e 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -19,8 +19,8 @@ void add_padding_1( const int* output_sizes, const int batch_size) { - const int batch_id = blockIdx.x; - const int grid_id = blockIdx.y; + const int batch_id = blockIdx.y; + const int grid_id = blockIdx.x; const int tid = threadIdx.x + grid_id * 256; const int grainsize = 16 * 256; const int batch_input_offset = offsets[batch_id]; @@ -59,8 +59,8 @@ void add_padding_2( const int* output_sizes, const int batch_size) { - const int batch_id = blockIdx.x; - const int grid_id = blockIdx.y; + const int batch_id = blockIdx.y; + const int grid_id = blockIdx.x; const int tid = threadIdx.x + grid_id * 256; const int grainsize = 16 * 256; const int offset = offsets[batch_id]; @@ -108,8 +108,8 @@ void add_padding_3( int output_numel, const int batch_size) { - const int batch_id = blockIdx.x; - const int grid_id = blockIdx.y; + const int batch_id = blockIdx.y; + const int grid_id = blockIdx.x; const int tid = threadIdx.x + grid_id * 256; const int offset = offsets[batch_id]; const int* sizes_i = input_sizes + batch_id * input_dim; @@ -146,8 +146,8 @@ void add_padding_kernelLauncher( const cudaStream_t stream) { dim3 grid; - grid.x = batch_size; - grid.y = 16; + grid.x = 16; + grid.y = batch_size; if (input_dim == 1) { add_padding_1<<>>( input, diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 794286d5..c19ea02a 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+6a98344' -git_version = '6a98344c3b5c0a0c1422b3e6e7adcac46e7712cc' +__version__ = '0.1.4+b4af654' +git_version = 'b4af6546cc11bf8458a0c69fb09d92f961eab014' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 31fbd424487e4edcf38fc5a76fa31808515e990d Mon Sep 17 00:00:00 2001 From: = <=> Date: Sun, 11 Jul 2021 05:41:11 +0000 Subject: [PATCH 07/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 7 ++----- nestedtensor/version.py | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index fe12c53e..0db673f1 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -111,16 +111,13 @@ void add_padding_3( const int batch_id = blockIdx.y; const int grid_id = blockIdx.x; const int tid = threadIdx.x + grid_id * 256; - const int offset = offsets[batch_id]; const int* sizes_i = input_sizes + batch_id * input_dim; - const int numel_i = sizes_i[0] * sizes_i[1] * sizes_i[2]; - const int output_offset = batch_id * output_numel; const int sizes_0 = sizes_i[0]; const int sizes_1 = sizes_i[1]; const int sizes_2 = sizes_i[2]; const int sizes_1_2 = sizes_1 * sizes_2; - output = output + output_offset; - input = input + offset; + output = output + batch_id * output_numel; + input = input + offsets[batch_id]; int i = tid; for (;i < output_numel;) { const int i0 = i / (output_sizes_2_3); diff --git a/nestedtensor/version.py b/nestedtensor/version.py index c19ea02a..14e0441e 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+b4af654' -git_version = 'b4af6546cc11bf8458a0c69fb09d92f961eab014' +__version__ = '0.1.4+1fc28de' +git_version = '1fc28deca4c53569c7e43468617a3ea60e38d4fa' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 04095b1f776a4fcf55f6869c7128d4e6bd8fe20f Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 12 Jul 2021 16:26:50 +0000 Subject: [PATCH 08/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 32 +++++++++++++------------------ nestedtensor/version.py | 4 ++-- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 0db673f1..1251229a 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -101,30 +101,27 @@ void add_padding_3( const int* offsets, const int* input_sizes, int input_dim, - int output_sizes_1, - int output_sizes_2, int output_sizes_3, int output_sizes_2_3, - int output_numel, - const int batch_size) + int output_numel) { const int batch_id = blockIdx.y; - const int grid_id = blockIdx.x; - const int tid = threadIdx.x + grid_id * 256; + const int i0 = blockIdx.x; + const int tid = threadIdx.x; const int* sizes_i = input_sizes + batch_id * input_dim; const int sizes_0 = sizes_i[0]; const int sizes_1 = sizes_i[1]; const int sizes_2 = sizes_i[2]; const int sizes_1_2 = sizes_1 * sizes_2; - output = output + batch_id * output_numel; - input = input + offsets[batch_id]; + output = output + batch_id * output_numel + i0 * output_sizes_2_3; + input = input + offsets[batch_id] + i0 * sizes_1_2; int i = tid; - for (;i < output_numel;) { - const int i0 = i / (output_sizes_2_3); - const int i1 = (i % (output_sizes_2_3)) / output_sizes_3; + bool valid_0 = i0 < sizes_0; + for (;i < output_sizes_2_3;) { + const int i1 = i / output_sizes_3; const int i2 = i % output_sizes_3; - bool valid = i0 < sizes_0 && i1 < sizes_1 && i2 < sizes_2; - const int input_offset = i0 * (sizes_1_2) + i1 * sizes_2 + i2; + const bool valid = valid_0 && i1 < sizes_1 && i2 < sizes_2; + const int input_offset = valid ? i1 * sizes_2 + i2 : 0; output[i] = valid ? input[input_offset] : padding_value; i += grainsize; } @@ -143,7 +140,7 @@ void add_padding_kernelLauncher( const cudaStream_t stream) { dim3 grid; - grid.x = 16; + grid.x = output_sizes[1]; grid.y = batch_size; if (input_dim == 1) { add_padding_1<<>>( @@ -168,19 +165,16 @@ void add_padding_kernelLauncher( batch_size); } if (input_dim == 3) { - add_padding_3<<>>( + add_padding_3<<>>( input, output, padding_value, offsets, input_sizes, input_dim, - output_sizes[1], - output_sizes[2], output_sizes[3], output_sizes[2] * output_sizes[3], - output_sizes[1] * output_sizes[2] * output_sizes[3], - batch_size); + output_sizes[1] * output_sizes[2] * output_sizes[3]); } } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 14e0441e..6016ec68 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+1fc28de' -git_version = '1fc28deca4c53569c7e43468617a3ea60e38d4fa' +__version__ = '0.1.4+31fbd42' +git_version = '31fbd424487e4edcf38fc5a76fa31808515e990d' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 6b987e64cba95e468f2334e2920cc74d13096c68 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 12 Jul 2021 16:28:28 +0000 Subject: [PATCH 09/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 8 ++++---- nestedtensor/version.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 1251229a..7e172983 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -105,8 +105,8 @@ void add_padding_3( int output_sizes_2_3, int output_numel) { - const int batch_id = blockIdx.y; - const int i0 = blockIdx.x; + const int batch_id = blockIdx.x; + const int i0 = blockIdx.y; const int tid = threadIdx.x; const int* sizes_i = input_sizes + batch_id * input_dim; const int sizes_0 = sizes_i[0]; @@ -140,8 +140,8 @@ void add_padding_kernelLauncher( const cudaStream_t stream) { dim3 grid; - grid.x = output_sizes[1]; - grid.y = batch_size; + grid.x = batch_size; + grid.y = output_sizes[1]; if (input_dim == 1) { add_padding_1<<>>( input, diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 6016ec68..7981a129 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+31fbd42' -git_version = '31fbd424487e4edcf38fc5a76fa31808515e990d' +__version__ = '0.1.4+04095b1' +git_version = '04095b1f776a4fcf55f6869c7128d4e6bd8fe20f' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 557edb6fb85b438bcccba1ab3f1c39a4a8d13961 Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 12 Jul 2021 17:07:58 +0000 Subject: [PATCH 10/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 9 ++++++++- nestedtensor/version.py | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 7e172983..76f46288 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -117,14 +117,21 @@ void add_padding_3( input = input + offsets[batch_id] + i0 * sizes_1_2; int i = tid; bool valid_0 = i0 < sizes_0; + if (valid_0) { for (;i < output_sizes_2_3;) { const int i1 = i / output_sizes_3; const int i2 = i % output_sizes_3; - const bool valid = valid_0 && i1 < sizes_1 && i2 < sizes_2; + const bool valid = i1 < sizes_1 && i2 < sizes_2; const int input_offset = valid ? i1 * sizes_2 + i2 : 0; output[i] = valid ? input[input_offset] : padding_value; i += grainsize; } + } else { + for (;i < output_sizes_2_3;) { + output[i] = padding_value; + i += grainsize; + } + } } template diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 7981a129..28b37d6c 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+04095b1' -git_version = '04095b1f776a4fcf55f6869c7128d4e6bd8fe20f' +__version__ = '0.1.4+6b987e6' +git_version = '6b987e64cba95e468f2334e2920cc74d13096c68' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 2719e6833bcdec69084953381aa05e53e9df9baa Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 12 Jul 2021 17:11:17 +0000 Subject: [PATCH 11/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 38 +++++++++++++++---------------- nestedtensor/version.py | 4 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 76f46288..4359843a 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -110,27 +110,27 @@ void add_padding_3( const int tid = threadIdx.x; const int* sizes_i = input_sizes + batch_id * input_dim; const int sizes_0 = sizes_i[0]; - const int sizes_1 = sizes_i[1]; - const int sizes_2 = sizes_i[2]; - const int sizes_1_2 = sizes_1 * sizes_2; - output = output + batch_id * output_numel + i0 * output_sizes_2_3; - input = input + offsets[batch_id] + i0 * sizes_1_2; int i = tid; - bool valid_0 = i0 < sizes_0; - if (valid_0) { - for (;i < output_sizes_2_3;) { - const int i1 = i / output_sizes_3; - const int i2 = i % output_sizes_3; - const bool valid = i1 < sizes_1 && i2 < sizes_2; - const int input_offset = valid ? i1 * sizes_2 + i2 : 0; - output[i] = valid ? input[input_offset] : padding_value; - i += grainsize; - } + if (i0 < sizes_0) { + const int sizes_1 = sizes_i[1]; + const int sizes_2 = sizes_i[2]; + const int sizes_1_2 = sizes_1 * sizes_2; + output = output + batch_id * output_numel + i0 * output_sizes_2_3; + input = input + offsets[batch_id] + i0 * sizes_1_2; + bool valid_0 = i0 < sizes_0; + for (;i < output_sizes_2_3;) { + const int i1 = i / output_sizes_3; + const int i2 = i % output_sizes_3; + const bool valid = i1 < sizes_1 && i2 < sizes_2; + const int input_offset = valid ? i1 * sizes_2 + i2 : 0; + output[i] = valid ? input[input_offset] : padding_value; + i += grainsize; + } } else { - for (;i < output_sizes_2_3;) { - output[i] = padding_value; - i += grainsize; - } + for (;i < output_sizes_2_3;) { + output[i] = padding_value; + i += grainsize; + } } } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 28b37d6c..ed5ecd48 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+6b987e6' -git_version = '6b987e64cba95e468f2334e2920cc74d13096c68' +__version__ = '0.1.4+557edb6' +git_version = '557edb6fb85b438bcccba1ab3f1c39a4a8d13961' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From c83a58a96b577031c5d65df35f5dc65acfb2a9ab Mon Sep 17 00:00:00 2001 From: = <=> Date: Mon, 12 Jul 2021 17:14:05 +0000 Subject: [PATCH 12/12] Checkpoint --- nestedtensor/csrc/cuda/padding.cu | 2 +- nestedtensor/version.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nestedtensor/csrc/cuda/padding.cu b/nestedtensor/csrc/cuda/padding.cu index 4359843a..61ae6166 100644 --- a/nestedtensor/csrc/cuda/padding.cu +++ b/nestedtensor/csrc/cuda/padding.cu @@ -111,11 +111,11 @@ void add_padding_3( const int* sizes_i = input_sizes + batch_id * input_dim; const int sizes_0 = sizes_i[0]; int i = tid; + output = output + batch_id * output_numel + i0 * output_sizes_2_3; if (i0 < sizes_0) { const int sizes_1 = sizes_i[1]; const int sizes_2 = sizes_i[2]; const int sizes_1_2 = sizes_1 * sizes_2; - output = output + batch_id * output_numel + i0 * output_sizes_2_3; input = input + offsets[batch_id] + i0 * sizes_1_2; bool valid_0 = i0 < sizes_0; for (;i < output_sizes_2_3;) { diff --git a/nestedtensor/version.py b/nestedtensor/version.py index ed5ecd48..8ec67334 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.1.4+557edb6' -git_version = '557edb6fb85b438bcccba1ab3f1c39a4a8d13961' +__version__ = '0.1.4+2719e68' +git_version = '2719e6833bcdec69084953381aa05e53e9df9baa' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION