From 12e5756e55e875a83cef0808092ea004a7e01641 Mon Sep 17 00:00:00 2001
From: John Pennycook
Date: Thu, 8 Oct 2020 17:19:52 -0400
Subject: [PATCH 1/2] [SYCL][CUDA] Add sub-group shuffles

Sub-group shuffles map to one of the following intrinsics:
- __nvvm_shfl_sync_idx_i32
- __nvvm_shfl_sync_up_i32
- __nvvm_shfl_sync_down_i32
- __nvvm_shfl_sync_bfly_i32

Implemented in the SYCL headers instead of libclc for two reasons:
1) The SPIR-V implementation uses an extension
   (__spirv_SubgroupShuffleINTEL)
2) We currently need to use enable_if to generate different instruction
   sequences for some types, and these cases differ between SPIR-V/PTX.

Signed-off-by: John Pennycook
---
 sycl/include/CL/sycl/detail/spirv.hpp | 170 ++++++++++++++++++++++----
 1 file changed, 149 insertions(+), 21 deletions(-)

diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp
index 1e94c3272e87b..a50c58e366199 100644
--- a/sycl/include/CL/sycl/detail/spirv.hpp
+++ b/sycl/include/CL/sycl/detail/spirv.hpp
@@ -35,29 +35,41 @@ template <> struct group_scope<::cl::sycl::ONEAPI::sub_group> {
   static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
 };
 
-// Generic shuffles and broadcasts may require multiple calls to SPIR-V
+// Generic shuffles and broadcasts may require multiple calls to
 // intrinsics, and should use the fewest broadcasts possible
-// - Loop over 64-bit chunks until remaining bytes < 64-bit
+// - Loop over chunks until remaining bytes < chunk size
 // - At most one 32-bit, 16-bit and 8-bit chunk left over
+#ifndef __NVPTX__
+using ShuffleChunkT = uint64_t;
+#else
+using ShuffleChunkT = uint32_t;
+#endif
 template <typename T, typename Functor>
 void GenericCall(const Functor &ApplyToBytes) {
-  if (sizeof(T) >= sizeof(uint64_t)) {
+  if (sizeof(T) >= sizeof(ShuffleChunkT)) {
 #pragma unroll
-    for (size_t Offset = 0; Offset < sizeof(T); Offset += sizeof(uint64_t)) {
-      ApplyToBytes(Offset, sizeof(uint64_t));
+    for (size_t Offset = 0; Offset + sizeof(ShuffleChunkT) <= sizeof(T);
+         Offset += sizeof(ShuffleChunkT)) {
+      ApplyToBytes(Offset, sizeof(ShuffleChunkT));
     }
   }
-  if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
-    size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
-    ApplyToBytes(Offset, sizeof(uint32_t));
+  if (sizeof(ShuffleChunkT) >= sizeof(uint64_t)) {
+    if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
+      size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
+      ApplyToBytes(Offset, sizeof(uint32_t));
+    }
   }
-  if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
-    size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
-    ApplyToBytes(Offset, sizeof(uint16_t));
+  if (sizeof(ShuffleChunkT) >= sizeof(uint32_t)) {
+    if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
+      size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
+      ApplyToBytes(Offset, sizeof(uint16_t));
+    }
   }
-  if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
-    size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
-    ApplyToBytes(Offset, sizeof(uint8_t));
+  if (sizeof(ShuffleChunkT) >= sizeof(uint16_t)) {
+    if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
+      size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
+      ApplyToBytes(Offset, sizeof(uint8_t));
+    }
   }
 }
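The chunk decomposition that GenericCall performs can be checked host-side. A minimal standalone sketch (not part of the patch): it uses the 32-bit ShuffleChunkT that NVPTX selects, condenses the nested ifs into single conditions, and substitutes a hypothetical printing functor for the real shuffle-and-memcpy lambda; the Payload struct is purely illustrative.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    using ShuffleChunkT = uint32_t; // NVPTX value; the SPIR-V path uses uint64_t

    template <typename T, typename Functor>
    void GenericCall(const Functor &ApplyToBytes) {
      // Complete chunks first (stop at the last offset with a full chunk left)...
      if (sizeof(T) >= sizeof(ShuffleChunkT)) {
        for (size_t Offset = 0; Offset + sizeof(ShuffleChunkT) <= sizeof(T);
             Offset += sizeof(ShuffleChunkT))
          ApplyToBytes(Offset, sizeof(ShuffleChunkT));
      }
      // ...then at most one 32-bit, one 16-bit and one 8-bit leftover.
      if (sizeof(ShuffleChunkT) >= sizeof(uint64_t) &&
          sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t))
        ApplyToBytes(sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t),
                     sizeof(uint32_t));
      if (sizeof(ShuffleChunkT) >= sizeof(uint32_t) &&
          sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t))
        ApplyToBytes(sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t),
                     sizeof(uint16_t));
      if (sizeof(ShuffleChunkT) >= sizeof(uint16_t) &&
          sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t))
        ApplyToBytes(sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t),
                     sizeof(uint8_t));
    }

    struct Payload { char Bytes[15]; }; // 15 = 3x4 + 2 + 1 with 4-byte chunks

    int main() {
      GenericCall<Payload>([](size_t Offset, size_t Size) {
        std::printf("chunk: offset=%zu size=%zu\n", Offset, Size);
      });
      // Prints 4-byte chunks at offsets 0/4/8, a 2-byte chunk at 12,
      // and a 1-byte chunk at 14: every byte is covered exactly once.
    }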
@@ -423,41 +435,118 @@ AtomicMax(multi_ptr<T, AddressSpace> MPtr, ONEAPI::memory_scope Scope,
   return __spirv_AtomicMax(Ptr, SPIRVScope, SPIRVOrder, Value);
 }
 
-// Native shuffles map directly to a SPIR-V SubgroupShuffle intrinsic
+// Native shuffles map directly to a shuffle intrinsic:
+// - The Intel SPIR-V extension natively supports all arithmetic types
+// - The CUDA shfl intrinsics do not support vectors, and we use the _i32
+//   variants for all scalar types
+#ifndef __NVPTX__
 template <typename T>
 using EnableIfNativeShuffle =
     detail::enable_if_t<detail::is_arithmetic<T>::value, T>;
+#else
+template <typename T>
+using EnableIfNativeShuffle = detail::enable_if_t<
+    std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t)), T>;
+
+template <typename T>
+using EnableIfVectorShuffle =
+    detail::enable_if_t<detail::is_vector_arithmetic<T>::value, T>;
+#endif
+
+#ifdef __NVPTX__
+inline uint32_t membermask() {
+  uint32_t FULL_MASK = 0xFFFFFFFF;
+  uint32_t max_size = __spirv_SubgroupMaxSize();
+  uint32_t sg_size = __spirv_SubgroupSize();
+  return FULL_MASK >> (max_size - sg_size);
+}
+#endif
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleINTEL(OCLT(x),
                                       static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_idx_i32(membermask(), x, local_id.get(0), 0x1f);
+#endif
 }
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleXorINTEL(
       OCLT(x), static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_bfly_i32(membermask(), x, local_id.get(0), 0x1f);
+#endif
 }
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleDownINTEL(
       OCLT(x), OCLT(x), static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_down_i32(membermask(), x, local_id.get(0), 0x1f);
+#endif
 }
 
 template <typename T>
 EnableIfNativeShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
+#ifndef __NVPTX__
   using OCLT = detail::ConvertToOpenCLType_t<T>;
   return __spirv_SubgroupShuffleUpINTEL(OCLT(x), OCLT(x),
                                         static_cast<uint32_t>(local_id.get(0)));
+#else
+  return __nvvm_shfl_sync_up_i32(membermask(), x, local_id.get(0), 0);
+#endif
 }
 
+#ifdef __NVPTX__
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffle(x[s], local_id);
+  }
+  return result;
+}
+
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffleXor(x[s], local_id);
+  }
+  return result;
+}
+
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffleDown(x[s], local_id);
+  }
+  return result;
+}
+
+template <typename T>
+EnableIfVectorShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = SubgroupShuffleUp(x[s], local_id);
+  }
+  return result;
+}
+#endif
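Every *_sync intrinsic above takes a mask of participating lanes, which membermask() derives from the two sub-group sizes: with max_size == sg_size == 32 the shift is zero and all 32 bits are set, while a partial sub-group keeps only the low sg_size bits. A host-side model of the same arithmetic (this membermask takes the sizes as parameters; the device version reads them via __spirv_SubgroupMaxSize/__spirv_SubgroupSize):

    #include <cassert>
    #include <cstdint>

    // Host-side model of membermask(): max_size is the maximum sub-group
    // size (the warp size, 32 on CUDA), sg_size the active size.
    constexpr uint32_t membermask(uint32_t max_size, uint32_t sg_size) {
      return 0xFFFFFFFFu >> (max_size - sg_size);
    }

    int main() {
      assert(membermask(32, 32) == 0xFFFFFFFFu); // full warp
      assert(membermask(32, 16) == 0x0000FFFFu); // lower 16 lanes
      assert(membermask(32, 1) == 0x00000001u);  // single lane
    }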
 
-// Bitcast shuffles can be implemented using a single SPIR-V SubgroupShuffle
+// Bitcast shuffles can be implemented using a single SubgroupShuffle
 // intrinsic, but require type-punning via an appropriate integer type
+#ifndef __NVPTX__
 template <typename T>
 using EnableIfBitcastShuffle =
     detail::enable_if_t<!detail::is_arithmetic<T>::value &&
@@ -465,6 +554,15 @@ using EnableIfBitcastShuffle =
                             (std::is_trivially_copyable<T>::value &&
                              (sizeof(T) == 1 || sizeof(T) == 2 ||
                               sizeof(T) == 4 || sizeof(T) == 8)),
                         T>;
+#else
+template <typename T>
+using EnableIfBitcastShuffle = detail::enable_if_t<
+    !(std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t))) &&
+    !detail::is_vector_arithmetic<T>::value &&
+    (std::is_trivially_copyable<T>::value &&
+     (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
+    T>;
+#endif
 
 template <typename T>
 using ConvertToNativeShuffleType_t = select_cl_scalar_integral_unsigned_t<T>;
@@ -473,8 +571,13 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleINTEL(
       ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_idx_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
@@ -482,8 +585,13 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleXorINTEL(
       ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_bfly_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
@@ -491,8 +599,13 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleDownINTEL(
       ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_down_i32(membermask(), ShuffleX, local_id.get(0), 0x1f);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
@@ -500,15 +613,21 @@ template <typename T>
 EnableIfBitcastShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
   using ShuffleT = ConvertToNativeShuffleType_t<T>;
   auto ShuffleX = detail::bit_cast<ShuffleT>(x);
+#ifndef __NVPTX__
   ShuffleT Result = __spirv_SubgroupShuffleUpINTEL(
       ShuffleX, ShuffleX, static_cast<uint32_t>(local_id.get(0)));
+#else
+  ShuffleT Result =
+      __nvvm_shfl_sync_up_i32(membermask(), ShuffleX, local_id.get(0), 0);
+#endif
   return detail::bit_cast<T>(Result);
 }
 
-// Generic shuffles may require multiple calls to SPIR-V SubgroupShuffle
+// Generic shuffles may require multiple calls to SubgroupShuffle
 // intrinsics, and should use the fewest shuffles possible:
-// - Loop over 64-bit chunks until remaining bytes < 64-bit
+// - Loop over chunks until remaining bytes < chunk size
 // - At most one 32-bit, 16-bit and 8-bit chunk left over
+#ifndef __NVPTX__
 template <typename T>
 using EnableIfGenericShuffle =
     detail::enable_if_t<!detail::is_arithmetic<T>::value &&
@@ -516,6 +635,15 @@ using EnableIfGenericShuffle =
                         !(std::is_trivially_copyable<T>::value &&
                           (sizeof(T) == 1 || sizeof(T) == 2 ||
                            sizeof(T) == 4 || sizeof(T) == 8)),
                         T>;
+#else
+template <typename T>
+using EnableIfGenericShuffle = detail::enable_if_t<
+    !(std::is_integral<T>::value && (sizeof(T) <= sizeof(int32_t))) &&
+    !detail::is_vector_arithmetic<T>::value &&
+    !(std::is_trivially_copyable<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4)),
+    T>;
+#endif
 
 template <typename T>
 EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
@@ -523,7 +651,7 @@ EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffle(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -538,7 +666,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -553,7 +681,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffleDown(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
@@ -568,7 +696,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, id<1> local_id) {
   char *XBytes = reinterpret_cast<char *>(&x);
   char *ResultBytes = reinterpret_cast<char *>(&Result);
   auto ShuffleBytes = [=](size_t Offset, size_t Size) {
-    uint64_t ShuffleX, ShuffleResult;
+    ShuffleChunkT ShuffleX, ShuffleResult;
     detail::memcpy(&ShuffleX, XBytes + Offset, Size);
     ShuffleResult = SubgroupShuffleUp(ShuffleX, local_id);
     detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
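Taken together, the three tiers on the CUDA path dispatch as follows: integral types up to 32 bits hit the intrinsic directly; other trivially copyable types of 1, 2 or 4 bytes are punned to a same-sized unsigned integer and shuffled once; everything larger (double, 64-bit integers, structs) falls through to the chunked generic path. A standalone sketch of the middle tier (host-only; shfl stands in for __nvvm_shfl_sync_idx_i32, and memcpy plays the role of detail::bit_cast, which is well-defined for trivially copyable types):

    #include <cstdint>
    #include <cstring>

    // Stand-in for __nvvm_shfl_sync_idx_i32: host code cannot issue the
    // real intrinsic, so this model just returns the value unchanged.
    static uint32_t shfl(uint32_t mask, uint32_t x, uint32_t lane) {
      (void)mask;
      (void)lane;
      return x;
    }

    // Shuffle a float by punning it to the same-sized unsigned integer,
    // mirroring the EnableIfBitcastShuffle path above.
    float ShuffleFloat(float x, uint32_t lane) {
      uint32_t Bits;
      std::memcpy(&Bits, &x, sizeof(Bits)); // float -> uint32_t
      uint32_t Result = shfl(0xFFFFFFFFu, Bits, lane);
      float Out;
      std::memcpy(&Out, &Result, sizeof(Out)); // uint32_t -> float
      return Out;
    }

    int main() { return ShuffleFloat(1.5f, 0) == 1.5f ? 0 : 1; }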
From e9d7cc279b7a0a974e10208483b7e88cac8af859 Mon Sep 17 00:00:00 2001
From: John Pennycook
Date: Fri, 9 Oct 2020 12:13:40 -0400
Subject: [PATCH 2/2] [SYCL][CUDA] Enable sub-group shuffle tests

Signed-off-by: John Pennycook
---
 sycl/test/sub_group/generic-shuffle.cpp |  5 +----
 sycl/test/sub_group/shuffle.cpp         | 11 +++--------
 sycl/test/sub_group/shuffle_fp16.cpp    |  5 +----
 sycl/test/sub_group/shuffle_fp64.cpp    |  5 +----
 4 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/sycl/test/sub_group/generic-shuffle.cpp b/sycl/test/sub_group/generic-shuffle.cpp
index e6825750925fc..c9ece6143be97 100644
--- a/sycl/test/sub_group/generic-shuffle.cpp
+++ b/sycl/test/sub_group/generic-shuffle.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -216,7 +213,7 @@ void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) {
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/sycl/test/sub_group/shuffle.cpp b/sycl/test/sub_group/shuffle.cpp
index 5207716148ef6..6d551a92bde7d 100644
--- a/sycl/test/sub_group/shuffle.cpp
+++ b/sycl/test/sub_group/shuffle.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,14 +16,12 @@
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
-  if (Queue.get_device().has_extension("cl_intel_subgroups_short")) {
-    check<short>(Queue);
-    check<unsigned short>(Queue);
-  }
+  check<short>(Queue);
+  check<unsigned short>(Queue);
   check<int>(Queue);
   check<unsigned int>(Queue);
   check<long>(Queue);
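shuffle.cpp exercises all four shuffle flavours for the types above. The xor variant's pairing is the least obvious one: each lane reads from lane (id ^ value), so a value of 1 swaps adjacent lanes. A host-side model of what the bfly intrinsic computes per lane, assuming an 8-lane sub-group purely for illustration:

    #include <cstdint>
    #include <cstdio>

    // Host model of a butterfly (xor) shuffle across an 8-lane sub-group:
    // lane i reads the value held by lane i ^ mask, which is what
    // __nvvm_shfl_sync_bfly_i32 implements per lane on the GPU.
    int main() {
      const uint32_t mask = 1;
      uint32_t vals[8];
      for (uint32_t lane = 0; lane < 8; ++lane)
        vals[lane] = lane * 10; // each lane's private value
      for (uint32_t lane = 0; lane < 8; ++lane)
        std::printf("lane %u receives %u\n", lane, vals[lane ^ mask]);
      // mask == 1 swaps neighbours: 0<->1, 2<->3, 4<->5, 6<->7.
    }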
diff --git a/sycl/test/sub_group/shuffle_fp16.cpp b/sycl/test/sub_group/shuffle_fp16.cpp
index 62f07fc612de8..ac0863a897fee 100644
--- a/sycl/test/sub_group/shuffle_fp16.cpp
+++ b/sycl/test/sub_group/shuffle_fp16.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 //
@@ -16,7 +13,7 @@
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
diff --git a/sycl/test/sub_group/shuffle_fp64.cpp b/sycl/test/sub_group/shuffle_fp64.cpp
index 3b1ed56907601..a1b153b4b8e36 100644
--- a/sycl/test/sub_group/shuffle_fp64.cpp
+++ b/sycl/test/sub_group/shuffle_fp64.cpp
@@ -1,6 +1,3 @@
-// UNSUPPORTED: cuda
-// CUDA compilation and runtime do not yet support sub-groups.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -19,7 +16,7 @@
 
 int main() {
   queue Queue;
-  if (!Queue.get_device().has_extension("cl_intel_subgroups")) {
+  if (Queue.get_device().is_host()) {
     std::cout << "Skipping test\n";
     return 0;
   }
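With both patches applied, the tests above exercise sub-group shuffles on the CUDA backend. For context, a minimal sketch of the kind of kernel they run (assumes the ONEAPI sub_group interface available on this branch; the kernel name, buffer size and shuffle pattern are illustrative):

    #include <CL/sycl.hpp>
    using namespace cl::sycl;

    int main() {
      queue Queue;
      int data[32] = {};
      {
        buffer<int, 1> Buf(data, range<1>(32));
        Queue.submit([&](handler &CGH) {
          auto Acc = Buf.get_access<access::mode::read_write>(CGH);
          CGH.parallel_for<class shuffle_demo>(
              nd_range<1>(range<1>(32), range<1>(32)), [=](nd_item<1> It) {
                auto SG = It.get_sub_group();
                size_t I = It.get_global_id(0);
                // Every work-item reads the value held by lane 0 of its
                // sub-group; on CUDA this lowers to a shfl.sync.idx.
                Acc[I] = SG.shuffle(static_cast<int>(I), id<1>(0));
              });
        });
      } // buffer destructor copies results back into data
      return 0;
    }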