Skip to content
65 changes: 65 additions & 0 deletions libclc/amdgcn-amdhsa/libspirv/group/collectives.cl
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,11 @@ __CLC_DECLARE_SHUFFLES(double, d);
/* Binary-operator helpers passed as the OP argument of the collective
 * macros below.  Every macro argument is fully parenthesized so that
 * callers may pass arbitrary expressions (e.g. `a | b`) without the
 * expansion re-associating under C precedence rules. */
#define __CLC_MIN(x, y) (((x) < (y)) ? (x) : (y))
#define __CLC_MAX(x, y) (((x) > (y)) ? (x) : (y))
#define __CLC_OR(x, y) ((x) | (y))
#define __CLC_XOR(x, y) ((x) ^ (y))
#define __CLC_AND(x, y) ((x) & (y))
#define __CLC_MUL(x, y) ((x) * (y))
#define __CLC_LOGICAL_OR(x, y) ((x) || (y))
#define __CLC_LOGICAL_AND(x, y) ((x) && (y))

#define __CLC_SUBGROUP_COLLECTIVE_BODY(OP, TYPE, TYPE_MANGLED, IDENTITY) \
uint sg_lid = __spirv_SubgroupLocalInvocationId(); \
Expand Down Expand Up @@ -152,6 +155,37 @@ __CLC_SUBGROUP_COLLECTIVE(FMax, __CLC_MAX, double, d, -INFINITY)
__CLC_SUBGROUP_COLLECTIVE(All, __CLC_AND, bool, a, true)
__CLC_SUBGROUP_COLLECTIVE(Any, __CLC_OR, bool, a, false)

// Sub-group collectives for the SPV_KHR_uniform_group_instructions
// bitwise operations, instantiated for every integer element width.
// The single-letter argument is the Itanium-mangled type code used to
// build the SPIR-V entry-point name (h=uchar, a=char, t=ushort,
// s=short, j=uint, i=int, m=ulong, l=long).  The final argument is the
// operation's identity element: all-ones (~0) for AND, zero for OR/XOR.
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uchar, h, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uchar, h, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uchar, h, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, char, a, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, char, a, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, char, a, 0)

__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ushort, t, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ushort, t, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ushort, t, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, short, s, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, short, s, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, short, s, 0)

__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uint, j, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uint, j, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uint, j, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, int, i, ~0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, int, i, 0)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, int, i, 0)

// 64-bit instantiations spell the identity with a `l` suffix so the
// constant has the full element width before any expansion.
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ulong, m, ~0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ulong, m, 0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ulong, m, 0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, long, l, ~0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, long, l, 0l)
__CLC_SUBGROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, l, 0l)

// Logical (short-circuit) collectives operate on bool; identities are
// false for OR and true for AND.
__CLC_SUBGROUP_COLLECTIVE(LogicalOrKHR, __CLC_LOGICAL_OR, bool, a, false)
__CLC_SUBGROUP_COLLECTIVE(LogicalAndKHR, __CLC_LOGICAL_AND, bool, a, true)

// Helper macros are local to this section; drop them once all
// sub-group entry points have been emitted.
#undef __CLC_SUBGROUP_COLLECTIVE_BODY
#undef __CLC_SUBGROUP_COLLECTIVE

Expand Down Expand Up @@ -267,6 +301,37 @@ __CLC_GROUP_COLLECTIVE(UMax, __CLC_MAX, ulong, 0)
__CLC_GROUP_COLLECTIVE(FMax, __CLC_MAX, float, -INFINITY)
__CLC_GROUP_COLLECTIVE(FMax, __CLC_MAX, double, -INFINITY)

// Work-group collectives for the SPV_KHR_uniform_group_instructions
// bitwise operations, instantiated for every integer element width.
// The final argument is the operation's identity element: all-ones
// (~0) for AND, zero for OR/XOR.
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uchar, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uchar, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uchar, 0)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, char, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, char, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, char, 0)

__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ushort, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ushort, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ushort, 0)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, short, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, short, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, short, 0)

__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, uint, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, uint, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, uint, 0)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, int, ~0)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, int, 0)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, int, 0)

// 64-bit instantiations spell the identity with a `l` suffix so the
// constant has the full element width before any expansion.
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, ulong, ~0l)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, ulong, 0l)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, ulong, 0l)
__CLC_GROUP_COLLECTIVE(BitwiseAndKHR, __CLC_AND, long, ~0l)
__CLC_GROUP_COLLECTIVE(BitwiseOrKHR, __CLC_OR, long, 0l)
__CLC_GROUP_COLLECTIVE(BitwiseXorKHR, __CLC_XOR, long, 0l)

// Logical (short-circuit) collectives operate on bool; identities are
// false for OR and true for AND.
__CLC_GROUP_COLLECTIVE(LogicalOrKHR, __CLC_LOGICAL_OR, bool, false)
__CLC_GROUP_COLLECTIVE(LogicalAndKHR, __CLC_LOGICAL_AND, bool, true)

// Helper macros are local to this section; drop them once all
// work-group entry points have been emitted.
#undef __CLC_GROUP_COLLECTIVE_4
#undef __CLC_GROUP_COLLECTIVE_5
#undef DISPATCH_TO_CLC_GROUP_COLLECTIVE_MACRO
Expand Down
7 changes: 3 additions & 4 deletions libclc/amdgcn-amdhsa/libspirv/misc/sub_group_shuffle.cl
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,15 @@

#include <spirv/spirv.h>

// Lane id of the current work-item within its sub-group.  The flattened
// diff left two conflicting definitions of SELF (a macro redefinition),
// and the newer one carried a stray trailing `;` that would break any
// expression use of the macro; keep a single, semicolon-free definition.
#define SELF __spirv_SubgroupLocalInvocationId()
// Widest possible sub-group (wavefront) size for this device.
#define SUBGROUP_SIZE __spirv_SubgroupMaxSize()

// Shuffle
// int __spirv_SubgroupShuffleINTEL<int>(int, unsigned int)
_CLC_DEF int
_Z28__spirv_SubgroupShuffleINTELIiET_S0_j(int Data, unsigned int InvocationId) {
  // The flattened diff concatenated the old and new bodies, leaving the
  // stale `self`-based index computation plus an unreachable second
  // return; only the new single-return implementation is kept.
  // ds_bpermute addresses lanes by byte, so scale the lane index by the
  // dword size (<< 2).
  int Index = InvocationId;
  return __builtin_amdgcn_ds_bpermute(Index << 2, Data);
}

// Sub 32-bit types.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,5 @@
#include <spirv/spirv.h>

_CLC_DEF _CLC_OVERLOAD uint __spirv_SubgroupLocalInvocationId() {
  // The flattened diff left the old workgroup-id-based computation (and
  // its return) in front of the new implementation, making the second
  // return unreachable; keep only the new body.
  // mbcnt(-1, ...) counts the active lanes with a lower id than the
  // current lane across the whole wavefront, which is exactly this
  // work-item's position within its sub-group.
  return __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
}
32 changes: 16 additions & 16 deletions sycl/include/sycl/sub_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,9 +277,9 @@ struct sub_group {
load(CVT *cv_src) const {
T *src = const_cast<T *>(cv_src);

#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
return src[get_local_id()[0]];
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<T>(src);
if (l)
return load(l);
Expand All @@ -290,7 +290,7 @@ struct sub_group {

assert(!"Sub-group load() is supported for local or global pointers only.");
return {};
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
}
#else //__SYCL_DEVICE_ONLY__
template <typename CVT, typename T = std::remove_cv_t<CVT>>
Expand All @@ -309,16 +309,16 @@ struct sub_group {
multi_ptr<T, Space, IsDecorated> src =
sycl::detail::GetUnqualMultiPtr(cv_src);
#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
return src.get()[get_local_id()[0]];
#else
return sycl::detail::sub_group::load(src);
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
#else
(void)src;
throw runtime_error("Sub-groups are not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif
#endif // __NVPTX__ || __AMDGCN__
}

template <typename CVT, access::address_space Space,
Expand All @@ -337,7 +337,7 @@ struct sub_group {
#endif
}
#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
template <int N, typename CVT, access::address_space Space,
access::decorated IsDecorated, typename T = std::remove_cv_t<CVT>>
std::enable_if_t<
Expand All @@ -352,7 +352,7 @@ struct sub_group {
}
return res;
}
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
template <int N, typename CVT, access::address_space Space,
access::decorated IsDecorated, typename T = std::remove_cv_t<CVT>>
std::enable_if_t<
Expand Down Expand Up @@ -455,9 +455,9 @@ struct sub_group {
std::enable_if_t<std::is_same<remove_decoration_t<T>, T>::value>
store(T *dst, const remove_decoration_t<T> &x) const {

#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
dst[get_local_id()[0]] = x;
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
auto l = __SYCL_GenericCastToPtrExplicit_ToLocal<T>(dst);
if (l) {
store(l, x);
Expand All @@ -473,7 +473,7 @@ struct sub_group {
assert(
!"Sub-group store() is supported for local or global pointers only.");
return;
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
}
#else //__SYCL_DEVICE_ONLY__
template <typename T> void store(T *dst, const T &x) const {
Expand All @@ -490,11 +490,11 @@ struct sub_group {
sycl::detail::sub_group::AcceptableForGlobalLoadStore<T, Space>::value>
store(multi_ptr<T, Space, DecorateAddress> dst, const T &x) const {
#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
dst.get()[get_local_id()[0]] = x;
#else
sycl::detail::sub_group::store(dst, x);
#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
#else
(void)dst;
(void)x;
Expand All @@ -519,7 +519,7 @@ struct sub_group {
}

#ifdef __SYCL_DEVICE_ONLY__
#ifdef __NVPTX__
#if defined(__NVPTX__) || defined(__AMDGCN__)
template <int N, typename T, access::address_space Space,
access::decorated DecorateAddress>
std::enable_if_t<
Expand All @@ -529,7 +529,7 @@ struct sub_group {
*(dst.get() + i * get_max_local_range()[0] + get_local_id()[0]) = x[i];
}
}
#else // __NVPTX__
#else // __NVPTX__ || __AMDGCN__
template <int N, typename T, access::address_space Space,
access::decorated DecorateAddress>
std::enable_if_t<
Expand Down Expand Up @@ -570,7 +570,7 @@ struct sub_group {
x.hi());
}

#endif // __NVPTX__
#endif // __NVPTX__ || __AMDGCN__
#else // __SYCL_DEVICE_ONLY__
template <int N, typename T, access::address_space Space,
access::decorated DecorateAddress>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/SYCL2020/exclusive_scan.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

// disabling hip because some of the binary_ops tested are not supported
// getting undefined symbols for a handful of __spirv__ * functions.
// XFAIL: hip

#include "support.h"
#include <algorithm>
#include <cassert>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/SYCL2020/inclusive_scan.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

// disabling hip because some of the binary_ops tested are not supported
// getting undefined symbols for a handful of __spirv__ * functions.
// XFAIL: hip

#include "support.h"
#include <algorithm>
#include <cassert>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

// disabling hip because some of the binary_ops tested are not supported
// getting undefined symbols for a handful of __spirv__ * functions.
// XFAIL: hip

#include "support.h"
#include <algorithm>
#include <cassert>
Expand Down
4 changes: 0 additions & 4 deletions sycl/test-e2e/SubGroup/load_store.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{run} %t.out
//
// Missing __spirv_SubgroupBlockReadINTEL, __spirv_SubgroupBlockWriteINTEL on
// AMD
// XFAIL: hip_amd
//
//==----------- load_store.cpp - SYCL sub_group load/store test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
Expand Down
2 changes: 0 additions & 2 deletions sycl/test-e2e/SubGroup/reduce_spirv13.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// UNSUPPORTED: hip

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

Expand Down
2 changes: 0 additions & 2 deletions sycl/test-e2e/SubGroup/scan_spirv13.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// UNSUPPORTED: hip

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

Expand Down
1 change: 1 addition & 0 deletions sycl/test-e2e/SubGroup/shuffle_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// UNSUPPORTED: hip
// Even though `gfx908` and `gfx906` support halves, libspirv is currently
// built with `tahiti` as the target CPU, which means that clang rejects
// AMD built-ins using halves; for that reason half support has to stay
Expand Down
10 changes: 2 additions & 8 deletions sycl/test-e2e/SubGroup/sub_group_as.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
// RUN: %{build} -DUSE_DEPRECATED_LOCAL_ACC -o %t.out -Wno-deprecated-declarations
// RUN: %{run} %t.out
//
// Missing __spirv_GenericCastToPtrExplicit_ToLocal,
// __spirv_SubgroupInvocationId, __spirv_GenericCastToPtrExplicit_ToGlobal,
// __spirv_SubgroupBlockReadINTEL, __assert_fail,
// __spirv_SubgroupBlockWriteINTEL on AMD
// error message `Barrier is not supported on the host device yet.` on Nvidia.
// XFAIL: hip_amd || hip_nvidia
// UNSUPPORTED: ze_debug

#include <cassert>
Expand Down Expand Up @@ -55,8 +49,8 @@ int main(int argc, char *argv[]) {
}
it.barrier();

int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) *
sg.get_max_local_range()[0];
int i = (it.get_global_id(0) / sg.get_local_range()[0]) *
sg.get_local_range()[0];
// Global address space
auto x = sg.load(&global[i]);
auto x_cv = sg.load<const volatile int>(&global[i]);
Expand Down
10 changes: 2 additions & 8 deletions sycl/test-e2e/SubGroup/sub_group_as_vec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@
// RUN: %{build} -DUSE_DEPRECATED_LOCAL_ACC -o %t.out
// RUN: %{run} %t.out
//
// Missing __spirv_GenericCastToPtrExplicit_ToLocal,
// __spirv_SubgroupLocalInvocationId, __spirv_GenericCastToPtrExplicit_ToGlobal,
// __spirv_SubgroupBlockReadINTEL, __assert_fail,
// __spirv_SubgroupBlockWriteINTEL on AMD
// error message `Barrier is not supported on the host device yet.` on Nvidia.
// XFAIL: hip_amd || hip_nvidia
// UNSUPPORTED: ze_debug

#include "helper.hpp"
Expand Down Expand Up @@ -57,8 +51,8 @@ int main(int argc, char *argv[]) {
}
it.barrier();

int i = (it.get_global_id(0) / sg.get_max_local_range()[0]) *
sg.get_max_local_range()[0];
int i = (it.get_global_id(0) / sg.get_local_range()[0]) *
sg.get_local_range()[0];
// Global address space
auto x = sg.load(&global[i]);
auto x_cv1 = sg.load<const volatile sycl::int2>(&global[i]);
Expand Down