Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,11 @@
"metadata": {},
"source": [
"```cpp\n",
" ONEAPI::sub_group sg = item.get_sub_group();\n",
" sycl::sub_group sg = nd_item.get_sub_group();\n",
"\n",
" OR\n",
"\n",
" auto sg = nd_item.get_sub_group();\n",
"```"
]
},
Expand Down Expand Up @@ -339,40 +343,40 @@
"#include <CL/sycl.hpp>\n",
"using namespace sycl;\n",
"\n",
"static constexpr size_t N = 256; // global size\n",
"static constexpr size_t N = 64; // global size\n",
"static constexpr size_t B = 64; // work-group size\n",
"static constexpr size_t S = 16; // sub_group size\n",
"\n",
"int main() {\n",
" queue q;\n",
" std::cout << \"Device : \" << q.get_device().get_info<info::device::name>() << std::endl;\n",
" \n",
" std::cout << \"Device : \" << q.get_device().get_info<info::device::name>() << \"\\n\";\n",
"\n",
" //# get all supported sub_group sizes and print\n",
" auto sg_sizes = q.get_device().get_info<info::device::sub_group_sizes>();\n",
" std::cout << \"Supported Sub-Group Sizes : \";\n",
" for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << \" \"; std::cout << std::endl;\n",
" for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << \" \"; std::cout << \"\\n\";\n",
" \n",
" //# find out maximum supported sub_group size\n",
" auto max_sg_size = std::max_element(sg_sizes.begin(), sg_sizes.end());\n",
" std::cout << \"Max Sub-Group Size : \" << max_sg_size[0] << std::endl;\n",
"\n",
" //# initialize data array using usm\n",
" int *data = malloc_shared<int>(N, q);\n",
" for(int i=0; i<N; i++) data[i] = i;\n",
"\n",
" //# use parallel_for and sub_groups\n",
" q.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(8)]]{\n",
" auto sg = item.get_sub_group();\n",
" auto i = item.get_global_id(0);\n",
" std::cout << \"Max Sub-Group Size : \" << max_sg_size[0] << \"\\n\";\n",
" \n",
" q.submit([&](handler &h) {\n",
" //# setup sycl stream class to print standard output from device code\n",
" auto out = stream(1024, 768, h);\n",
"\n",
" //# write sub_group tp zero except first location for each sub_group\n",
" if (sg.get_local_id()[0] != 0) data[i] = 0;\n",
" //# nd-range kernel with user specified sub_group size\n",
" h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {\n",
" //# get sub_group handle\n",
" auto sg = item.get_sub_group();\n",
"\n",
" //# query sub_group and print sub_group info once per sub_group\n",
" if (sg.get_local_id()[0] == 0) {\n",
" out << \"sub_group id: \" << sg.get_group_id()[0] << \" of \"\n",
" << sg.get_group_range()[0] << \", size=\" << sg.get_local_range()[0]\n",
" << endl;\n",
" }\n",
" });\n",
" }).wait();\n",
"\n",
" for(int i=0; i<N; i++) std::cout << data[i] << \" \"; std::cout << std::endl;\n",
" \n",
" free(data, q);\n",
" return 0;\n",
"}\n"
"}"
]
},
{
Expand Down Expand Up @@ -416,22 +420,22 @@
"Shuffle operations enable us to remove work-group local memory usage from our kernels and/or to __avoid unnecessary repeated accesses to global memory__.\n",
"\n",
"Below are the different types of shuffle operations available for sub-groups:\n",
"- shuffle\n",
"- shuffle_up\n",
"- shuffle_down\n",
"- shuffle_xor\n",
"- `select_by_group(sg, x, id)`\n",
"- `shift_group_left(sg, x, delta)`\n",
"- `shift_group_right(sg, x, delta)`\n",
"- `permute_group_by_xor(sg, x, mask)`\n",
"\n",
"The code below uses `shuffle_xor` to swap the values of two work-items:\n",
"\n",
"```cpp\n",
" h.parallel_for(nd_range<1>(N,B), [=](nd_item<1> item){\n",
"      auto sg = item.get_sub_group();\n",
"      size_t i = item.get_global_id(0);\n",
"      auto i = item.get_global_id(0);\n",
"      /* Shuffles */\n",
"      //data[i] = sg.shuffle(data[i], 2);\n",
"      //data[i] = sg.shuffle_up(data[i], 1);\n",
"      //data[i] = sg.shuffle_down(data[i], 1);\n",
"      data[i] = sg.shuffle_xor(data[i], 1);\n",
"      //data[i] = select_by_group(sg, data[i], 2);\n",
"      //data[i] = shift_group_left(sg, data[i], 1);\n",
"      //data[i] = shift_group_right(sg, data[i], 1);\n",
"      data[i] = permute_group_by_xor(sg, data[i], 1);\n",
" });\n",
"\n",
"```\n",
Expand Down Expand Up @@ -492,10 +496,10 @@
" auto i = item.get_global_id(0);\n",
"\n",
" //# swap adjacent items in array using sub_group shuffle_xor\n",
" data[i] = sg.shuffle_xor(data[i], 1);\n",
" data[i] = permute_group_by_xor(sg, data[i], 1);\n",
" \n",
" //# reverse the order of items in sub_group using shuffle_xor\n",
" //data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);\n",
" //data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);\n",
" \n",
" }).wait();\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,37 @@
#include <CL/sycl.hpp>
using namespace sycl;

static constexpr size_t N = 256; // global size
static constexpr size_t N = 64; // global size
static constexpr size_t B = 64; // work-group size
static constexpr size_t S = 32; // sub_group size

int main() {
queue q;
std::cout << "Device : " << q.get_device().get_info<info::device::name>() << std::endl;

std::cout << "Device : " << q.get_device().get_info<info::device::name>() << "\n";

//# get all supported sub_group sizes and print
auto sg_sizes = q.get_device().get_info<info::device::sub_group_sizes>();
std::cout << "Supported Sub-Group Sizes : ";
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << std::endl;
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << "\n";

//# find out maximum supported sub_group size
auto max_sg_size = std::max_element(sg_sizes.begin(), sg_sizes.end());
std::cout << "Max Sub-Group Size : " << max_sg_size[0] << std::endl;

//# initialize data array using usm
int *data = malloc_shared<int>(N, q);
for(int i=0; i<N; i++) data[i] = i;

//# use parallel_for and sub_groups
q.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(S)]] {
auto sg = item.get_sub_group();
auto i = item.get_global_id(0);

//# write sub_group tp zero except first location for each sub_group
if (sg.get_local_id()[0] != 0) data[i] = 0;

std::cout << "Max Sub-Group Size : " << max_sg_size[0] << "\n";

q.submit([&](handler &h) {
//# setup sycl stream class to print standard output from device code
auto out = stream(1024, 768, h);

//# nd-range kernel with user specified sub_group size
h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {
//# get sub_group handle
auto sg = item.get_sub_group();

//# query sub_group and print sub_group info once per sub_group
if (sg.get_local_id()[0] == 0) {
out << "sub_group id: " << sg.get_group_id()[0] << " of "
<< sg.get_group_range()[0] << ", size=" << sg.get_local_range()[0]
<< endl;
}
});
}).wait();

for(int i=0; i<N; i++) std::cout << data[i] << " "; std::cout << std::endl;

free(data, q);
return 0;
}

Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ int main() {
auto i = item.get_global_id(0);

//# swap adjacent items in array using sub_group shuffle_xor
data[i] = sg.shuffle_xor(data[i], 1);
data[i] = permute_group_by_xor(sg, data[i], 1);

//# reverse the order of items in sub_group using shuffle_xor
//data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);
//data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);

}).wait();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,38 +6,37 @@
#include <CL/sycl.hpp>
using namespace sycl;

static constexpr size_t N = 256; // global size
static constexpr size_t N = 64; // global size
static constexpr size_t B = 64; // work-group size
static constexpr size_t S = 32; // sub_group size

int main() {
queue q;
std::cout << "Device : " << q.get_device().get_info<info::device::name>() << std::endl;

std::cout << "Device : " << q.get_device().get_info<info::device::name>() << "\n";

//# get all supported sub_group sizes and print
auto sg_sizes = q.get_device().get_info<info::device::sub_group_sizes>();
std::cout << "Supported Sub-Group Sizes : ";
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << std::endl;
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << "\n";

//# find out maximum supported sub_group size
auto max_sg_size = std::max_element(sg_sizes.begin(), sg_sizes.end());
std::cout << "Max Sub-Group Size : " << max_sg_size[0] << std::endl;

//# initialize data array using usm
int *data = malloc_shared<int>(N, q);
for(int i=0; i<N; i++) data[i] = i;

//# use parallel_for and sub_groups
q.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(S)]] {
auto sg = item.get_sub_group();
auto i = item.get_global_id(0);

//# write sub_group tp zero except first location for each sub_group
if (sg.get_local_id()[0] != 0) data[i] = 0;

std::cout << "Max Sub-Group Size : " << max_sg_size[0] << "\n";

q.submit([&](handler &h) {
//# setup sycl stream class to print standard output from device code
auto out = stream(1024, 768, h);

//# nd-range kernel with user specified sub_group size
h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {
//# get sub_group handle
auto sg = item.get_sub_group();

//# query sub_group and print sub_group info once per sub_group
if (sg.get_local_id()[0] == 0) {
out << "sub_group id: " << sg.get_group_id()[0] << " of "
<< sg.get_group_range()[0] << ", size=" << sg.get_local_range()[0]
<< endl;
}
});
}).wait();

for(int i=0; i<N; i++) std::cout << data[i] << " "; std::cout << std::endl;

free(data, q);
return 0;
}

Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ int main() {
auto i = item.get_global_id(0);

//# swap adjacent items in array using sub_group shuffle_xor
data[i] = sg.shuffle_xor(data[i], 1);
data[i] = permute_group_by_xor(sg, data[i], 1);

//# reverse the order of items in sub_group using shuffle_xor
//data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);
//data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);

}).wait();

Expand Down