From 12e5756e55e875a83cef0808092ea004a7e01641 Mon Sep 17 00:00:00 2001
From: John Pennycook
Date: Thu, 8 Oct 2020 17:19:52 -0400
Subject: [PATCH 1/2] [SYCL][CUDA] Add sub-group shuffles

Sub-group shuffles map to one of the following intrinsics:
- __nvvm_shfl_sync_idx_i32
- __nvvm_shfl_sync_up_i32
- __nvvm_shfl_sync_down_i32
- __nvvm_shfl_sync_bfly_i32

Implemented in the SYCL headers instead of libclc for two reasons:
1) The SPIR-V implementation uses an extension
   (__spirv_SubgroupShuffleINTEL)
2) We currently need to use enable_if to generate different instruction
   sequences for some types, and these cases differ between SPIR-V/PTX.

Signed-off-by: John Pennycook
---
 sycl/include/CL/sycl/detail/spirv.hpp | 170 ++++++++++++++++++++++----
 1 file changed, 149 insertions(+), 21 deletions(-)

diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp
index 1e94c3272e87b..a50c58e366199 100644
--- a/sycl/include/CL/sycl/detail/spirv.hpp
+++ b/sycl/include/CL/sycl/detail/spirv.hpp
@@ -35,29 +35,41 @@ template <> struct group_scope<::cl::sycl::ONEAPI::sub_group> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
 };
 
-// Generic shuffles and broadcasts may require multiple calls to SPIR-V
+// Generic shuffles and broadcasts may require multiple calls to
 // intrinsics, and should use the fewest broadcasts possible
-// - Loop over 64-bit chunks until remaining bytes < 64-bit
+// - Loop over chunks until remaining bytes < chunk size
 // - At most one 32-bit, 16-bit and 8-bit chunk left over
+#ifndef __NVPTX__
+using ShuffleChunkT = uint64_t;
+#else
+using ShuffleChunkT = uint32_t;
+#endif
 template <typename T, typename Functor>
 void GenericCall(const Functor &ApplyToBytes) {
-  if (sizeof(T) >= sizeof(uint64_t)) {
+  if (sizeof(T) >= sizeof(ShuffleChunkT)) {
 #pragma unroll
-    for (size_t Offset = 0; Offset < sizeof(T); Offset += sizeof(uint64_t)) {
-      ApplyToBytes(Offset, sizeof(uint64_t));
+    for (size_t Offset = 0; Offset + sizeof(ShuffleChunkT) <= sizeof(T);
+         Offset += sizeof(ShuffleChunkT)) {
+      ApplyToBytes(Offset, sizeof(ShuffleChunkT));
     }
   }
-  if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
-    size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
-    ApplyToBytes(Offset, sizeof(uint32_t));
+  if (sizeof(ShuffleChunkT) >= sizeof(uint64_t)) {
+    if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
+      size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
+      ApplyToBytes(Offset, sizeof(uint32_t));
+    }
   }
-  if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
-    size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
-    ApplyToBytes(Offset, sizeof(uint16_t));
+  if (sizeof(ShuffleChunkT) >= sizeof(uint32_t)) {
+    if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
+      size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
+      ApplyToBytes(Offset, sizeof(uint16_t));
+    }
   }
-  if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
-    size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
-    ApplyToBytes(Offset, sizeof(uint8_t));
+  if (sizeof(ShuffleChunkT) >= sizeof(uint16_t)) {
+    if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
+      size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
+      ApplyToBytes(Offset, sizeof(uint8_t));
+    }
   }
 }
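The chunk decomposition that GenericCall performs can be checked host-side. A minimal standalone sketch (not part of the patch): it uses the 32-bit ShuffleChunkT that NVPTX selects, condenses the nested ifs into single conditions, and substitutes a hypothetical printing functor for the real shuffle-and-memcpy lambda; the Payload struct is purely illustrative.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    using ShuffleChunkT = uint32_t; // NVPTX value; the SPIR-V path uses uint64_t

    template <typename T, typename Functor>
    void GenericCall(const Functor &ApplyToBytes) {
      // Complete chunks first (stop at the last offset with a full chunk left)...
      if (sizeof(T) >= sizeof(ShuffleChunkT)) {
        for (size_t Offset = 0; Offset + sizeof(ShuffleChunkT) <= sizeof(T);
             Offset += sizeof(ShuffleChunkT))
          ApplyToBytes(Offset, sizeof(ShuffleChunkT));
      }
      // ...then at most one 32-bit, one 16-bit and one 8-bit leftover.
      if (sizeof(ShuffleChunkT) >= sizeof(uint64_t) &&
          sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t))
        ApplyToBytes(sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t),
                     sizeof(uint32_t));
      if (sizeof(ShuffleChunkT) >= sizeof(uint32_t) &&
          sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t))
        ApplyToBytes(sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t),
                     sizeof(uint16_t));
      if (sizeof(ShuffleChunkT) >= sizeof(uint16_t) &&
          sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t))
        ApplyToBytes(sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t),
                     sizeof(uint8_t));
    }

    struct Payload { char Bytes[15]; }; // 15 = 3x4 + 2 + 1 with 4-byte chunks

    int main() {
      GenericCall<Payload>([](size_t Offset, size_t Size) {
        std::printf("chunk: offset=%zu size=%zu\n", Offset, Size);
      });
      // Prints 4-byte chunks at offsets 0/4/8, a 2-byte chunk at 12,
      // and a 1-byte chunk at 14: every byte is covered exactly once.
    }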
@@ -423,41 +435,118 @@ AtomicMax(multi_ptr<T, AddressSpace> MPtr, ONEAPI::memory_scope Scope,
   return __spirv_AtomicMax(Ptr, SPIRVScope, SPIRVOrder, Value);
 }
 
-// Native shuffles map directly to a SPIR-V SubgroupShuffle intrinsic
+// Native shuffles map directly to a shuffle intrinsic:
+// - The Intel SPIR-V extension natively supports all arithmetic types
+// - The CUDA shfl intrinsics do not support vectors, and we use the _i32
+//   variants for all scalar types
+#ifndef __NVPTX__
 template <typename T>
 using EnableIfNativeShuffle =
     detail::enable_if_t<detail::is_arithmetic<T>::value, T>;
+#else
+template <typename T>
+using EnableIfNativeShuffle = detail::enable_if_t<
+    std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t)), T>;
+
+template <typename T>
+using EnableIfVectorShuffle =
+    detail::enable_if_t<detail::is_vector_arithmetic<T>::value, T>;
+#endif
+
+#ifdef __NVPTX__
+inline uint32_t membermask() {
+  uint32_t FULL_MASK = 0xFFFFFFFF;
+  uint32_t max_size = __spirv_SubgroupMaxSize();
+  uint32_t sg_size = __spirv_SubgroupSize();
+  return FULL_MASK >> (max_size - sg_size);
+}
+#endif
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleINTEL(OCLT(x),
                                       static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_idx_i32(membermask(), x, local_id.get(0), 0x1f);
+#endif
 }
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleXorINTEL(
       OCLT(x), static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_bfly_i32(membermask(), x, local_id.get(0), 0x1f);
+#endif
 }
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleDownINTEL(
       OCLT(x), OCLT(x), static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_down_i32(membermask(), x, local_id.get(0), 0x1f);
+#endif
 }
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x),
                                         static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_up_i32(membermask(), x, local_id.get(0), 0);
+#endif
 }
 
+#ifdef __NVPTX__
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffle(x[s], local_id);
+  }
+  return result;
+}
+
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffleXor(x[s], local_id);
+  }
+  return result;
+}
+
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffleDown(x[s], local_id);
+  }
+  return result;
+}
+
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffleUp(x[s], local_id);
+  }
+  return result;
+}
+#endif
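Every *_sync intrinsic above takes a mask of participating lanes, which membermask() derives from the two sub-group sizes: with max_size == sg_size == 32 the shift is zero and all 32 bits are set, while a partial sub-group keeps only the low sg_size bits. A host-side model of the same arithmetic (this membermask takes the sizes as parameters; the device version reads them via __spirv_SubgroupMaxSize/__spirv_SubgroupSize):

    #include <cassert>
    #include <cstdint>

    // Host-side model of membermask(): max_size is the maximum sub-group
    // size (the warp size, 32 on CUDA), sg_size the active size.
    constexpr uint32_t membermask(uint32_t max_size, uint32_t sg_size) {
      return 0xFFFFFFFFu >> (max_size - sg_size);
    }

    int main() {
      assert(membermask(32, 32) == 0xFFFFFFFFu); // full warp
      assert(membermask(32, 16) == 0x0000FFFFu); // lower 16 lanes
      assert(membermask(32, 1) == 0x00000001u);  // single lane
    }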
 
-// Bitcast shuffles can be implemented using a single SPIR-V SubgroupShuffle
+// Bitcast shuffles can be implemented using a single SubgroupShuffle
 // intrinsic, but require type-punning via an appropriate integer type
+#ifndef __NVPTX__
 template <typename T>
 using EnableIfBitcastShuffle =
     detail::enable_if_t<!detail::is_arithmetic<T>::value &&
@@ -465,6 +554,15 @@ using EnableIfBitcastShuffle =
                             (std::is_trivially_copyable<T>::value &&
                              (sizeof(T) == 1 || sizeof(T) == 2 ||
                               sizeof(T) == 4 || sizeof(T) == 8)),
                         T>;
+#else
+template <typename T>
+using EnableIfBitcastShuffle = detail::enable_if_t<
+    !(std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t))) &&
+    !detail::is_vector_arithmetic<T>::value &&
+    (std::is_trivially_copyable<T>::value &&
+     (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
+    T>;
+#endif
 
 template <typename T>
 using ConvertToNativeShuffleType_t = select_cl_scalar_integral_unsigned_t<T>;
@@ -473,8 +571,13 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleINTEL(
       ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_idx_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
@@ -482,8 +585,13 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleXorINTEL(
       ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_bfly_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
@@ -491,8 +599,13 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(
       ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
@@ -500,15 +613,21 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleUpINTEL(
       ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, local_id.get(0), 0);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
-// Generic shuffles may require multiple calls to SPIR-V SubgroupShuffle
+// Generic shuffles may require multiple calls to SubgroupShuffle
 // intrinsics, and should use the fewest shuffles possible:
-// - Loop over 64-bit chunks until remaining bytes < 64-bit
+// - Loop over chunks until remaining bytes < chunk size
 // - At most one 32-bit, 16-bit and 8-bit chunk left over
+#ifndef __NVPTX__
 template <typename T>
 using EnableIfGenericShuffle =
     detail::enable_if_t<!detail::is_arithmetic<T>::value &&
@@ -516,6 +635,15 @@ using EnableIfGenericShuffle =
                         !(std::is_trivially_copyable<T>::value &&
                           (sizeof(T) == 1 || sizeof(T) == 2 ||
                            sizeof(T) == 4 || sizeof(T) == 8)),
                         T>;
+#else
+template <typename T>
+using EnableIfGenericShuffle = detail::enable_if_t<
+    !(std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t))) &&
+    !detail::is_vector_arithmetic<T>::value &&
+    !(std::is_trivially_copyable<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
+    T>;
+#endif
 
 template <typename T>
 EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
@@ -523,7 +651,7 @@ EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffle(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -538,7 +666,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -553,7 +681,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -568,7 +696,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
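Taken together, the three tiers on the CUDA path dispatch as follows: integral types up to 32 bits hit the intrinsic directly; other trivially copyable types of 1, 2 or 4 bytes are punned to a same-sized unsigned integer and shuffled once; everything larger (double, 64-bit integers, structs) falls through to the chunked generic path. A standalone sketch of the middle tier (host-only; shfl stands in for __nvvm_shfl_sync_idx_i32, and memcpy plays the role of detail::bit_cast, which is well-defined for trivially copyable types):

    #include <cstdint>
    #include <cstring>

    // Stand-in for __nvvm_shfl_sync_idx_i32: host code cannot issue the
    // real intrinsic, so this model just returns the value unchanged.
    static uint32_t shfl(uint32_t mask, uint32_t x, uint32_t lane) {
      (void)mask;
      (void)lane;
      return x;
    }

    // Shuffle a float by punning it to the same-sized unsigned integer,
    // mirroring the EnableIfBitcastShuffle path above.
    float ShuffleFloat(float x, uint32_t lane) {
      uint32_t Bits;
      std::memcpy(&Bits, &x, sizeof(Bits)); // float -> uint32_t
      uint32_t Result = shfl(0xFFFFFFFFu, Bits, lane);
      float Out;
      std::memcpy(&Out, &Result, sizeof(Out)); // uint32_t -> float
      return Out;
    }

    int main() { return ShuffleFloat(1.5f, 0) == 1.5f ? 0 : 1; }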
From e9d7cc279b7a0a974e10208483b7e88cac8af859 Mon Sep 17 00:00:00 2001
From: John Pennycook
Date: Fri, 9 Oct 2020 12:13:40 -0400
Subject: [PATCH 2/2] [SYCL][CUDA] Enable sub-group shuffle tests

Signed-off-by: John Pennycook
---
 sycl/test/sub_group/generic-shuffle.cpp |  5 +----
 sycl/test/sub_group/shuffle.cpp         | 11 +++--------
 sycl/test/sub_group/shuffle_fp16.cpp    |  5 +----
 sycl/test/sub_group/shuffle_fp64.cpp    |  5 +----
 4 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/sycl/test/sub_group/generic-shuffle.cpp b/sycl/test/sub_group/generic-shuffle.cpp
index e6825750925fc..c9ece6143be97 100644
--- a/sycl/test/sub_group/generic-shuffle.cpp
+++ b/sycl/test/sub_group/generic-shuffle.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -216,7 +213,7 @@ void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) {
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/sycl/test/sub_group/shuffle.cpp b/sycl/test/sub_group/shuffle.cpp
index 5207716148ef6..6d551a92bde7d 100644
--- a/sycl/test/sub_group/shuffle.cpp
+++ b/sycl/test/sub_group/shuffle.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,14 +16,12 @@
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
-  if (Queue.get_device().has_extension("cl_intel_subgroups_short")) {
-    check<short>(Queue);
-    check<unsigned short>(Queue);
-  }
+  check<short>(Queue);
+  check<unsigned short>(Queue);
   check<int>(Queue);
   check<unsigned int>(Queue);
   check<long>(Queue);
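shuffle.cpp exercises all four shuffle flavours for the types above. The xor variant's pairing is the least obvious one: each lane reads from lane (id ^ value), so a value of 1 swaps adjacent lanes. A host-side model of what the bfly intrinsic computes per lane, assuming an 8-lane sub-group purely for illustration:

    #include <cstdint>
    #include <cstdio>

    // Host model of a butterfly (xor) shuffle across an 8-lane sub-group:
    // lane i reads the value held by lane i ^ mask, which is what
    // __nvvm_shfl_sync_bfly_i32 implements per lane on the GPU.
    int main() {
      const uint32_t mask = 1;
      uint32_t vals[8];
      for (uint32_t lane = 0; lane < 8; ++lane)
        vals[lane] = lane * 10; // each lane's private value
      for (uint32_t lane = 0; lane < 8; ++lane)
        std::printf("lane %u receives %u\n", lane, vals[lane ^ mask]);
      // mask == 1 swaps neighbours: 0<->1, 2<->3, 4<->5, 6<->7.
    }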
diff --git a/sycl/test/sub_group/shuffle_fp16.cpp b/sycl/test/sub_group/shuffle_fp16.cpp
index 62f07fc612de8..ac0863a897fee 100644
--- a/sycl/test/sub_group/shuffle_fp16.cpp
+++ b/sycl/test/sub_group/shuffle_fp16.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
@@ -16,7 +13,7 @@
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/sycl/test/sub_group/shuffle_fp64.cpp b/sycl/test/sub_group/shuffle_fp64.cpp
index 3b1ed56907601..a1b153b4b8e36 100644
--- a/sycl/test/sub_group/shuffle_fp64.cpp
+++ b/sycl/test/sub_group/shuffle_fp64.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,7 +16,7 @@
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
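With both patches applied, the tests above exercise sub-group shuffles on the CUDA backend. For context, a minimal sketch of the kind of kernel they run (assumes the ONEAPI sub_group interface available on this branch; the kernel name, buffer size and shuffle pattern are illustrative):

    #include <CL/sycl.hpp>
    using namespace cl::sycl;

    int main() {
      queue Queue;
      int data[32] = {};
      {
        buffer<int, 1> Buf(data, range<1>(32));
        Queue.submit([&](handler &CGH) {
          auto Acc = Buf.get_access<access::mode::read_write>(CGH);
          CGH.parallel_for<class shuffle_demo>(
              nd_range<1>(range<1>(32), range<1>(32)), [=](nd_item<1> It) {
                auto SG = It.get_sub_group();
                size_t I = It.get_global_id(0);
                // Every work-item reads the value held by lane 0 of its
                // sub-group; on CUDA this lowers to a shfl.sync.idx.
                Acc[I] = SG.shuffle(static_cast<int>(I), id<1>(0));
              });
        });
      } // buffer destructor copies results back into data
      return 0;
    }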