Skip to content

Commit 281f0e0

Browse files
author
Gang Y Chen
committed
[SYCL][ESIMD] reduce restriction on some esimd API based upon user feedback
- allow 256-byte block load for slm - allow vector-length of 1/2/4/8/16/32 for gather and scatter Signed-off-by: Gang Y Chen <[email protected]>
1 parent 304067c commit 281f0e0

File tree

3 files changed

+40
-11
lines changed

3 files changed

+40
-11
lines changed

sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ constexpr unsigned int ElemsPerAddrEncoding() {
8484
template <typename T, int n, int ElemsPerAddr = 1,
8585
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
8686
ESIMD_INLINE ESIMD_NODEBUG
87-
typename std::enable_if<((n == 8 || n == 16 || n == 32) &&
87+
typename std::enable_if<(__esimd::isPowerOf2(n, 32) &&
8888
(ElemsPerAddr == 1 || ElemsPerAddr == 2 ||
8989
ElemsPerAddr == 4)),
9090
simd<T, n * ElemsPerAddr>>::type
@@ -120,7 +120,7 @@ ESIMD_INLINE ESIMD_NODEBUG
120120
template <typename T, int n, int ElemsPerAddr = 1,
121121
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
122122
ESIMD_INLINE ESIMD_NODEBUG
123-
typename std::enable_if<((n == 8 || n == 16 || n == 32) &&
123+
typename std::enable_if<(__esimd::isPowerOf2(n, 32) &&
124124
(ElemsPerAddr == 1 || ElemsPerAddr == 2 ||
125125
ElemsPerAddr == 4)),
126126
void>::type
@@ -487,9 +487,9 @@ ESIMD_INLINE ESIMD_NODEBUG simd<T, n> slm_block_load(uint32_t offset) {
487487
static_assert(Sz % __esimd::OWORD == 0,
488488
"block size must be whole number of owords");
489489
static_assert(__esimd::isPowerOf2(Sz / __esimd::OWORD),
490-
"block must be 1, 2, 4 or 8 owords long");
491-
static_assert(Sz <= 8 * __esimd::OWORD,
492-
"block size must be at most 8 owords");
490+
"block must be 1, 2, 4, 8, 16 owords long");
491+
static_assert(Sz <= 16 * __esimd::OWORD,
492+
"block size must be at most 16 owords");
493493

494494
return __esimd_slm_block_read<T, n>(offset);
495495
}
@@ -503,9 +503,9 @@ ESIMD_INLINE ESIMD_NODEBUG void slm_block_store(uint32_t offset,
503503
static_assert(Sz % __esimd::OWORD == 0,
504504
"block size must be whole number of owords");
505505
static_assert(__esimd::isPowerOf2(Sz / __esimd::OWORD),
506-
"block must be 1, 2, 4 or 8 owords long");
507-
static_assert(Sz <= 8 * __esimd::OWORD,
508-
"block size must be at most 8 owords");
506+
"block must be 1, 2, 4, 8, or 16 owords long");
507+
static_assert(Sz <= 16 * __esimd::OWORD,
508+
"block size must be at most 16 owords");
509509

510510
// offset in genx.oword.st is in owords
511511
__esimd_slm_block_write<T, n>(offset >> 4, vals.data());

sycl/test/basic_tests/esimd/gather_scatter.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,22 @@
99
using namespace sycl::INTEL::gpu;
1010
using namespace cl::sycl;
1111

12-
void kernel(accessor<int, 1, access::mode::read_write, access::target::global_buffer> &buf) __attribute__((sycl_device)) {
12+
void kernel0(accessor<int, 1, access::mode::read_write,
13+
access::target::global_buffer> &buf)
14+
__attribute__((sycl_device)) {
15+
simd<uint32_t, 2> offsets(0, 1);
16+
simd<int, 2> v1(0, 1);
17+
18+
auto v0 = gather<int, 2>(buf.get_pointer(), offsets);
19+
20+
v0 = v0 + v1;
21+
22+
scatter<int, 2>(buf.get_pointer(), v0, offsets);
23+
}
24+
25+
void kernel(accessor<int, 1, access::mode::read_write,
26+
access::target::global_buffer> &buf)
27+
__attribute__((sycl_device)) {
1328
simd<uint32_t, 32> offsets(0, 1);
1429
simd<int, 32> v1(0, 1);
1530

@@ -20,7 +35,9 @@ void kernel(accessor<int, 1, access::mode::read_write, access::target::global_bu
2035
scatter<int, 32>(buf.get_pointer(), v0, offsets);
2136
}
2237

23-
void kernel(accessor<uint8_t, 1, access::mode::read_write, access::target::global_buffer> &buf) __attribute__((sycl_device)) {
38+
void kernel(accessor<uint8_t, 1, access::mode::read_write,
39+
access::target::global_buffer> &buf)
40+
__attribute__((sycl_device)) {
2441
simd<uint32_t, 32> offsets(0, 1);
2542
simd<uint8_t, 32> v1(0, 1);
2643

@@ -33,7 +50,9 @@ void kernel(accessor<uint8_t, 1, access::mode::read_write, access::target::globa
3350
scatter<uint8_t, 32>(buf.get_pointer(), v0, offsets);
3451
}
3552

36-
void kernel(accessor<uint16_t, 1, access::mode::read_write, access::target::global_buffer> &buf) __attribute__((sycl_device)) {
53+
void kernel(accessor<uint16_t, 1, access::mode::read_write,
54+
access::target::global_buffer> &buf)
55+
__attribute__((sycl_device)) {
3756
simd<uint32_t, 32> offsets(0, 1);
3857
simd<uint16_t, 32> v1(0, 1);
3958

sycl/test/basic_tests/esimd/slm_block.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,13 @@ void kernel() __attribute__((sycl_device)) {
1818

1919
slm_block_store<int, 32>(0, v0);
2020
}
21+
22+
void kernel2() __attribute__((sycl_device)) {
23+
simd<int, 64> v1(0, 1);
24+
25+
auto v0 = slm_block_load<int, 64>(0);
26+
27+
v0 = v0 + v1;
28+
29+
slm_block_store<int, 64>(0, v0);
30+
}

0 commit comments

Comments
 (0)