Skip to content
65 changes: 65 additions & 0 deletions libclc/amdgcn-amdhsa/libspirv/group/collectives.cl
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,11 @@ __CLC_DECLARE_SHUFFLES(double, d);
/* Binary-operator helpers passed as the OP argument of the collective
 * macros below.  Every macro argument is fully parenthesized so that
 * callers may pass arbitrary expressions (e.g. `a | b`) without the
 * expansion re-associating under C precedence rules. */
#define __CLC_MIN(x, y) (((x) < (y)) ? (x) : (y))
#define __CLC_MAX(x, y) (((x) > (y)) ? (x) : (y))
#define __CLC_OR(x, y) ((x) | (y))
#define __CLC_XOR(x, y) ((x) ^ (y))
#define __CLC_AND(x, y) ((x) & (y))
#define __CLC_MUL(x, y) ((x) * (y))
#define __CLC_LOGICAL_OR(x, y) ((x) || (y))
#define __CLC_LOGICAL_AND(x, y) ((x) && (y))

#define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, TYPE_MANGLED, IDENTITY) \
uint sg_lid = __spirv_SubgroupLocalInvocationId(); \
Expand Down Expand Up @@ -152,6 +155,37 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, d, -INFINITY)
__CLC_SUBGROUP_COLLECTIVE(All, __CLC_AND, bool, a, true)
__CLC_SUBGROUP_COLLECTIVE(Any, __CLC_OR, bool, a, false)

// Sub-group collectives for the SPV_KHR_uniform_group_instructions
// bitwise operations, instantiated for every integer element width.
// The single-letter argument is the Itanium-mangled type code used to
// build the SPIR-V entry-point name (h=uchar, a=char, t=ushort,
// s=short, j=uint, i=int, m=ulong, l=long).  The final argument is the
// operation's identity element: all-ones (~0) for AND, zero for OR/XOR.
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uchar, h, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uchar, h, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uchar, h, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, char, a, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, char, a, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, char, a, 0)

__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ushort, t, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ushort, t, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ushort, t, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, short, s, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, short, s, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, short, s, 0)

__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uint, j, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uint, j, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uint, j, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, int, i, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, int, i, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, int, i, 0)

// 64-bit instantiations spell the identity with a `l` suffix so the
// constant has the full element width before any expansion.
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ulong, m, ~0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ulong, m, 0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ulong, m, 0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, long, l, ~0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, long, l, 0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, l, 0l)

// Logical (short-circuit) collectives operate on bool; identities are
// false for OR and true for AND.
__CLC_SUBGROUP_COLLECTIVE(LogicalOrKHR, __CLC_LOGICAL_OR, bool, a, false)
__CLC_SUBGROUP_COLLECTIVE(LogicalAndKHR, __CLC_LOGICAL_AND, bool, a, true)

// Helper macros are local to this section; drop them once all
// sub-group entry points have been emitted.
#undef __CLC_SUBGROUP_COLLECTIVE_BODY
#undef __CLC_SUBGROUP_COLLECTIVE

Expand Down Expand Up @@ -267,6 +301,37 @@ __CLC_GROUP_COLLECTIVE(UMax, __CLC_MAX, ulong, 0)
__CLC_GROUP_COLLECTIVE(FMax, __CLC_MAX, float, -INFINITY)
__CLC_GROUP_COLLECTIVE(FMax, __CLC_MAX, double, -INFINITY)

// Work-group collectives for the SPV_KHR_uniform_group_instructions
// bitwise operations, instantiated for every integer element width.
// The final argument is the operation's identity element: all-ones
// (~0) for AND, zero for OR/XOR.
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uchar, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uchar, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uchar, 0)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, char, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, char, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, char, 0)

__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ushort, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ushort, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ushort, 0)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, short, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, short, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, short, 0)

__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uint, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uint, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uint, 0)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, int, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, int, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, int, 0)

// 64-bit instantiations spell the identity with a `l` suffix so the
// constant has the full element width before any expansion.
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ulong, ~0l)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ulong, 0l)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ulong, 0l)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, long, ~0l)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, long, 0l)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, 0l)

// Logical (short-circuit) collectives operate on bool; identities are
// false for OR and true for AND.
__CLC_GROUP_COLLECTIVE(LogicalOrKHR, __CLC_LOGICAL_OR, bool, false)
__CLC_GROUP_COLLECTIVE(LogicalAndKHR, __CLC_LOGICAL_AND, bool, true)

// Helper macros are local to this section; drop them once all
// work-group entry points have been emitted.
#undef __CLC_GROUP_COLLECTIVE_4
#undef __CLC_GROUP_COLLECTIVE_5
#undef DISPATCH_TO_CLC_GROUP_COLLECTIVE_MACRO
Expand Down
7 changes: 3 additions & 4 deletions libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,15 @@

#include <spirv/spirv.h>

// Lane id of the current work-item within its sub-group.  The flattened
// diff left two conflicting definitions of SELF (a macro redefinition),
// and the newer one carried a stray trailing `;` that would break any
// expression use of the macro; keep a single, semicolon-free definition.
#define SELF __spirv_SubgroupLocalInvocationId()
// Widest possible sub-group (wavefront) size for this device.
#define SUBGROUP_SIZE __spirv_SubgroupMaxSize()

// Shuffle
// int __spirv_SubgroupShuffleINTEL<int>(int, unsigned int)
_CLC_DEF int
_Z28__spirv_SubgroupShuffleINTELIiET_S0_j(int Data, unsigned int InvocationId) {
  // The flattened diff concatenated the old and new bodies, leaving the
  // stale `self`-based index computation plus an unreachable second
  // return; only the new single-return implementation is kept.
  // ds_bpermute addresses lanes by byte, so scale the lane index by the
  // dword size (<< 2).
  int Index = InvocationId;
  return __builtin_amdgcn_ds_bpermute(Index << 2, Data);
}

// Sub 32-bit types.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,5 @@
#include <spirv/spirv.h>

_CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupLocalInvocationId() {
  // The flattened diff left the old workgroup-id-based computation (and
  // its return) in front of the new implementation, making the second
  // return unreachable; keep only the new body.
  // mbcnt(-1, ...) counts the active lanes with a lower id than the
  // current lane across the whole wavefront, which is exactly this
  // work-item's position within its sub-group.
  return __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
32 changes: 16 additions & 16 deletions sycl/include/sycl/sub_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,9 @@ struct sub_group {
load(CVT *cv_src) const {
T *src = const_cast<T *>(cv_src);

#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
return src[get_local_id()[0]];
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<T>(src);
if (l)
return load(l);
Expand All @@ -290,7 +290,7 @@ struct sub_group {

assert(!"Sub-group load() is supported for local or global pointers only.");
return {};
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
}
#else //__SYCL_DEVICE_ONLY__
template <typename CVT, typename T = std::remove_cv_t<CVT>>
Expand All @@ -309,16 +309,16 @@ struct sub_group {
multi_ptr<T, Space, IsDecorated> src =
sycl::detail::GetUnqualMultiPtr(cv_src);
#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
return src.get()[get_local_id()[0]];
#else
return sycl::detail::sub_group::load(src);
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
#else
(void)src;
throw runtime_error("Sub-groups are not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif
#endif // __NVPTX__ || __AMDGCN__
}

template <typename CVT, access::address_space Space,
Expand All @@ -337,7 +337,7 @@ struct sub_group {
#endif
}
#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
template <int N, typename CVT, access::address_space Space,
access::decorated IsDecorated, typename T = std::remove_cv_t<CVT>>
std::enable_if_t<
Expand All @@ -352,7 +352,7 @@ struct sub_group {
}
return res;
}
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
template <int N, typename CVT, access::address_space Space,
access::decorated IsDecorated, typename T = std::remove_cv_t<CVT>>
std::enable_if_t<
Expand Down Expand Up @@ -455,9 +455,9 @@ struct sub_group {
std::enable_if_t<std::is_same<remove_decoration_t<T>, T>::value>
store(T *dst, const remove_decoration_t<T> &x) const {

#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
dst[get_local_id()[0]] = x;
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<T>(dst);
if (l) {
store(l, x);
Expand All @@ -473,7 +473,7 @@ struct sub_group {
assert(
!"Sub-group store() is supported for local or global pointers only.");
return;
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> void store(T *dst, const T &x) const {
Expand All @@ -490,11 +490,11 @@ struct sub_group {
sycl::detail::sub_group::AcceptableForGlobalLoadStore<T, Space>::value>
store(multi_ptr<T, Space, DecorateAddress> dst, const T &x) const {
#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
dst.get()[get_local_id()[0]] = x;
#else
sycl::detail::sub_group::store(dst, x);
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
#else
(void)dst;
(void)x;
Expand All @@ -519,7 +519,7 @@ struct sub_group {
}

#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
template <int N, typename T, access::address_space Space,
access::decorated DecorateAddress>
std::enable_if_t<
Expand All @@ -529,7 +529,7 @@ struct sub_group {
*(dst.get() + i * get_max_local_range()[0] + get_local_id()[0]) = x[i];
}
}
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
template <int N, typename T, access::address_space Space,
access::decorated DecorateAddress>
std::enable_if_t<
Expand Down Expand Up @@ -570,7 +570,7 @@ struct sub_group {
x.hi());
}

#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
#else // __SYCL_DEVICE_ONLY__
template <int N, typename T, access::address_space Space,
access::decorated DecorateAddress>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/SYCL2020/exclusive_scan.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

// disabling hip because some of the binary_ops tested are not supported
// getting undefined symbols for a handful of __spirv__ * functions.
// XFAIL: hip

#include "support.h"
#include <algorithm>
#include <cassert>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/SYCL2020/inclusive_scan.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

// disabling hip because some of the binary_ops tested are not supported
// getting undefined symbols for a handful of __spirv__ * functions.
// XFAIL: hip

#include "support.h"
#include <algorithm>
#include <cassert>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

// disabling hip because some of the binary_ops tested are not supported
// getting undefined symbols for a handful of __spirv__ * functions.
// XFAIL: hip

#include "support.h"
#include <algorithm>
#include <cassert>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/SubGroup/load_store.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{run} %t.out
//
// Missing __spirv_SubgroupBlockReadINTEL, __spirv_SubgroupBlockWriteINTEL on
// AMD
// XFAIL: hip_amd
//
//==----------- load_store.cpp - SYCL sub_group load/store test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
Expand Down
2 changes: 0 additions & 2 deletions sycl/test-e2e/SubGroup/reduce_spirv13.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// UNSUPPORTED: hip

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

Expand Down
2 changes: 0 additions & 2 deletions sycl/test-e2e/SubGroup/scan_spirv13.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// UNSUPPORTED: hip

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

Expand Down
1 change: 1 addition & 0 deletions sycl/test-e2e/SubGroup/shuffle_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// UNSUPPORTED: hip
// Even though `gfx908` and `gfx906` support halves, libspirv is currently
// built with `tahiti` as the target CPU, which means that clang rejects
// AMD built-ins using halves; for that reason half support has to stay
Expand Down
10 changes: 2 additions & 8 deletions sycl/test-e2e/SubGroup/sub_group_as.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
// RUN: %{build} -DUSE_DEPRECATED_LOCAL_ACC -o %t.out -Wno-deprecated-declarations
// RUN: %{run} %t.out
//
// Missing __spirv_GenericCastToPtrExplicit_ToLocal,
// __spirv_SubgroupInvocationId, __spirv_GenericCastToPtrExplicit_ToGlobal,
// __spirv_SubgroupBlockReadINTEL, __assert_fail,
// __spirv_SubgroupBlockWriteINTEL on AMD
// error message `Barrier is not supported on the host device yet.` on Nvidia.
// XFAIL: hip_amd || hip_nvidia
// UNSUPPORTED: ze_debug

#include <cassert>
Expand Down Expand Up @@ -55,8 +49,8 @@ int main(int argc, char *argv[]) {
}
it.barrier();

int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) *
sg.get_max_local_range()[0];
int i = (it.get_global_id(0) / sg.get_local_range()[0]) *
sg.get_local_range()[0];
// Global address space
auto x = sg.load(&global[i]);
auto x_cv = sg.load<const volatile int>(&global[i]);
Expand Down
10 changes: 2 additions & 8 deletions sycl/test-e2e/SubGroup/sub_group_as_vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
// RUN: %{build} -DUSE_DEPRECATED_LOCAL_ACC -o %t.out
// RUN: %{run} %t.out
//
// Missing __spirv_GenericCastToPtrExplicit_ToLocal,
// __spirv_SubgroupLocalInvocationId, __spirv_GenericCastToPtrExplicit_ToGlobal,
// __spirv_SubgroupBlockReadINTEL, __assert_fail,
// __spirv_SubgroupBlockWriteINTEL on AMD
// error message `Barrier is not supported on the host device yet.` on Nvidia.
// XFAIL: hip_amd || hip_nvidia
// UNSUPPORTED: ze_debug

#include "helper.hpp"
Expand Down Expand Up @@ -57,8 +51,8 @@ int main(int argc, char *argv[]) {
}
it.barrier();

int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) *
sg.get_max_local_range()[0];
int i = (it.get_global_id(0) / sg.get_local_range()[0]) *
sg.get_local_range()[0];
// Global address space
auto x = sg.load(&global[i]);
auto x_cv1 = sg.load<const volatile sycl::int2>(&global[i]);
Expand Down