Skip to content

Commit 180bb46

Browse files
authored
Updated DPC++ Sub-Groups Jupyter Notebook and code with SYCL 2020 format changes (#733)
* updated DPC++ SubGroups to sycl2020 format * fixed DPC++ sub-group sample code * fixed typo in DPC++ Sub-Groups Notebook
1 parent fa51f46 commit 180bb46

File tree

5 files changed

+91
-89
lines changed

5 files changed

+91
-89
lines changed

DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/Sub_Groups.ipynb

Lines changed: 39 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,11 @@
145145
"metadata": {},
146146
"source": [
147147
"```cpp\n",
148-
" ONEAPI::sub_group sg = item.get_sub_group();\n",
148+
" sycl::sub_group sg = nd_item.get_sub_group();\n",
149+
"\n",
150+
" // OR\n",
151+
"\n",
152+
" auto sg = nd_item.get_sub_group();\n",
149153
"```"
150154
]
151155
},
@@ -339,40 +343,40 @@
339343
"#include <CL/sycl.hpp>\n",
340344
"using namespace sycl;\n",
341345
"\n",
342-
"static constexpr size_t N = 256; // global size\n",
346+
"static constexpr size_t N = 64; // global size\n",
343347
"static constexpr size_t B = 64; // work-group size\n",
344-
"static constexpr size_t S = 16; // sub_group size\n",
345348
"\n",
346349
"int main() {\n",
347350
" queue q;\n",
348-
" std::cout << \"Device : \" << q.get_device().get_info<info::device::name>() << std::endl;\n",
349-
" \n",
351+
" std::cout << \"Device : \" << q.get_device().get_info<info::device::name>() << \"\\n\";\n",
352+
"\n",
353+
" //# get all supported sub_group sizes and print\n",
350354
" auto sg_sizes = q.get_device().get_info<info::device::sub_group_sizes>();\n",
351355
" std::cout << \"Supported Sub-Group Sizes : \";\n",
352-
" for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << \" \"; std::cout << std::endl;\n",
356+
" for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << \" \"; std::cout << \"\\n\";\n",
353357
" \n",
358+
" //# find out maximum supported sub_group size\n",
354359
" auto max_sg_size = std::max_element(sg_sizes.begin(), sg_sizes.end());\n",
355-
" std::cout << \"Max Sub-Group Size : \" << max_sg_size[0] << std::endl;\n",
356-
"\n",
357-
" //# initialize data array using usm\n",
358-
" int *data = malloc_shared<int>(N, q);\n",
359-
" for(int i=0; i<N; i++) data[i] = i;\n",
360-
"\n",
361-
" //# use parallel_for and sub_groups\n",
362-
" q.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(8)]]{\n",
363-
" auto sg = item.get_sub_group();\n",
364-
" auto i = item.get_global_id(0);\n",
360+
" std::cout << \"Max Sub-Group Size : \" << max_sg_size[0] << \"\\n\";\n",
361+
" \n",
362+
" q.submit([&](handler &h) {\n",
363+
" //# setup sycl stream class to print standard output from device code\n",
364+
" auto out = stream(1024, 768, h);\n",
365365
"\n",
366-
" //# write sub_group tp zero except first location for each sub_group\n",
367-
" if (sg.get_local_id()[0] != 0) data[i] = 0;\n",
366+
" //# nd-range kernel with user specified sub_group size\n",
367+
" h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {\n",
368+
" //# get sub_group handle\n",
369+
" auto sg = item.get_sub_group();\n",
368370
"\n",
371+
" //# query sub_group and print sub_group info once per sub_group\n",
372+
" if (sg.get_local_id()[0] == 0) {\n",
373+
" out << \"sub_group id: \" << sg.get_group_id()[0] << \" of \"\n",
374+
" << sg.get_group_range()[0] << \", size=\" << sg.get_local_range()[0]\n",
375+
" << endl;\n",
376+
" }\n",
377+
" });\n",
369378
" }).wait();\n",
370-
"\n",
371-
" for(int i=0; i<N; i++) std::cout << data[i] << \" \"; std::cout << std::endl;\n",
372-
" \n",
373-
" free(data, q);\n",
374-
" return 0;\n",
375-
"}\n"
379+
"}"
376380
]
377381
},
378382
{
@@ -416,22 +420,22 @@
416420
"Shuffle operations enable us to remove work-group local memory usage from our kernels and/or to __avoid unnecessary repeated accesses to global memory__.\n",
417421
"\n",
418422
"Below are the different types of shuffle operations available for sub-groups:\n",
419-
"- shuffle\n",
420-
"- shuffle_up\n",
421-
"- shuffle_down\n",
422-
"- shuffle_xor\n",
423+
"- `select_from_group(sg, x, id)`\n",
424+
"- `shift_group_left(sg, x, delta)`\n",
425+
"- `shift_group_right(sg, x, delta)`\n",
426+
"- `permute_group_by_xor(sg, x, mask)`\n",
423427
"\n",
424428
"The code below uses `shuffle_xor` to swap the values of two work-items:\n",
425429
"\n",
426430
"```cpp\n",
427431
" h.parallel_for(nd_range<1>(N,B), [=](nd_item<1> item){\n",
428432
"      auto sg = item.get_sub_group();\n",
429-
"      size_t i = item.get_global_id(0);\n",
433+
"      auto i = item.get_global_id(0);\n",
430434
"      /* Shuffles */\n",
431-
"      //data[i] = sg.shuffle(data[i], 2);\n",
432-
"      //data[i] = sg.shuffle_up(data[i], 1);\n",
433-
"      //data[i] = sg.shuffle_down(data[i], 1);\n",
434-
"      data[i] = sg.shuffle_xor(data[i], 1);\n",
435+
"      //data[i] = select_from_group(sg, data[i], 2);\n",
436+
"      //data[i] = shift_group_left(sg, data[i], 1);\n",
437+
"      //data[i] = shift_group_right(sg, data[i], 1);\n",
438+
"      data[i] = permute_group_by_xor(sg, data[i], 1);\n",
435439
" });\n",
436440
"\n",
437441
"```\n",
@@ -492,10 +496,10 @@
492496
" auto i = item.get_global_id(0);\n",
493497
"\n",
494498
" //# swap adjacent items in array using sub_group permute_group_by_xor\n",
495-
" data[i] = sg.shuffle_xor(data[i], 1);\n",
499+
" data[i] = permute_group_by_xor(sg, data[i], 1);\n",
496500
" \n",
497501
" //# reverse the order of items in sub_group using permute_group_by_xor\n",
498-
" //data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);\n",
502+
" //data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);\n",
499503
" \n",
500504
" }).wait();\n",
501505
"\n",

DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/lab/sub_group_reqd_size.cpp

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,38 +6,37 @@
66
#include <CL/sycl.hpp>
77
using namespace sycl;
88

9-
static constexpr size_t N = 256; // global size
9+
static constexpr size_t N = 64; // global size
1010
static constexpr size_t B = 64; // work-group size
11-
static constexpr size_t S = 32; // sub_group size
1211

1312
int main() {
1413
queue q;
15-
std::cout << "Device : " << q.get_device().get_info<info::device::name>() << std::endl;
16-
14+
std::cout << "Device : " << q.get_device().get_info<info::device::name>() << "\n";
15+
16+
//# get all supported sub_group sizes and print
1717
auto sg_sizes = q.get_device().get_info<info::device::sub_group_sizes>();
1818
std::cout << "Supported Sub-Group Sizes : ";
19-
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << std::endl;
19+
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << "\n";
2020

21+
//# find out maximum supported sub_group size
2122
auto max_sg_size = std::max_element(sg_sizes.begin(), sg_sizes.end());
22-
std::cout << "Max Sub-Group Size : " << max_sg_size[0] << std::endl;
23-
24-
//# initialize data array using usm
25-
int *data = malloc_shared<int>(N, q);
26-
for(int i=0; i<N; i++) data[i] = i;
27-
28-
//# use parallel_for and sub_groups
29-
q.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(S)]] {
30-
auto sg = item.get_sub_group();
31-
auto i = item.get_global_id(0);
32-
33-
//# write sub_group tp zero except first location for each sub_group
34-
if (sg.get_local_id()[0] != 0) data[i] = 0;
35-
23+
std::cout << "Max Sub-Group Size : " << max_sg_size[0] << "\n";
24+
25+
q.submit([&](handler &h) {
26+
//# setup sycl stream class to print standard output from device code
27+
auto out = stream(1024, 768, h);
28+
29+
//# nd-range kernel with user specified sub_group size
30+
h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {
31+
//# get sub_group handle
32+
auto sg = item.get_sub_group();
33+
34+
//# query sub_group and print sub_group info once per sub_group
35+
if (sg.get_local_id()[0] == 0) {
36+
out << "sub_group id: " << sg.get_group_id()[0] << " of "
37+
<< sg.get_group_range()[0] << ", size=" << sg.get_local_range()[0]
38+
<< endl;
39+
}
40+
});
3641
}).wait();
37-
38-
for(int i=0; i<N; i++) std::cout << data[i] << " "; std::cout << std::endl;
39-
40-
free(data, q);
41-
return 0;
4242
}
43-

DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/lab/sub_group_shuffle.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ int main() {
2525
auto i = item.get_global_id(0);
2626

2727
//# swap adjacent items in array using sub_group permute_group_by_xor
28-
data[i] = sg.shuffle_xor(data[i], 1);
28+
data[i] = permute_group_by_xor(sg, data[i], 1);
2929

3030
//# reverse the order of items in sub_group using permute_group_by_xor
31-
//data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);
31+
//data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);
3232

3333
}).wait();
3434

DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/src/sub_group_reqd_size.cpp

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,38 +6,37 @@
66
#include <CL/sycl.hpp>
77
using namespace sycl;
88

9-
static constexpr size_t N = 256; // global size
9+
static constexpr size_t N = 64; // global size
1010
static constexpr size_t B = 64; // work-group size
11-
static constexpr size_t S = 32; // sub_group size
1211

1312
int main() {
1413
queue q;
15-
std::cout << "Device : " << q.get_device().get_info<info::device::name>() << std::endl;
16-
14+
std::cout << "Device : " << q.get_device().get_info<info::device::name>() << "\n";
15+
16+
//# get all supported sub_group sizes and print
1717
auto sg_sizes = q.get_device().get_info<info::device::sub_group_sizes>();
1818
std::cout << "Supported Sub-Group Sizes : ";
19-
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << std::endl;
19+
for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << " "; std::cout << "\n";
2020

21+
//# find out maximum supported sub_group size
2122
auto max_sg_size = std::max_element(sg_sizes.begin(), sg_sizes.end());
22-
std::cout << "Max Sub-Group Size : " << max_sg_size[0] << std::endl;
23-
24-
//# initialize data array using usm
25-
int *data = malloc_shared<int>(N, q);
26-
for(int i=0; i<N; i++) data[i] = i;
27-
28-
//# use parallel_for and sub_groups
29-
q.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(S)]] {
30-
auto sg = item.get_sub_group();
31-
auto i = item.get_global_id(0);
32-
33-
//# write sub_group tp zero except first location for each sub_group
34-
if (sg.get_local_id()[0] != 0) data[i] = 0;
35-
23+
std::cout << "Max Sub-Group Size : " << max_sg_size[0] << "\n";
24+
25+
q.submit([&](handler &h) {
26+
//# setup sycl stream class to print standard output from device code
27+
auto out = stream(1024, 768, h);
28+
29+
//# nd-range kernel with user specified sub_group size
30+
h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {
31+
//# get sub_group handle
32+
auto sg = item.get_sub_group();
33+
34+
//# query sub_group and print sub_group info once per sub_group
35+
if (sg.get_local_id()[0] == 0) {
36+
out << "sub_group id: " << sg.get_group_id()[0] << " of "
37+
<< sg.get_group_range()[0] << ", size=" << sg.get_local_range()[0]
38+
<< endl;
39+
}
40+
});
3641
}).wait();
37-
38-
for(int i=0; i<N; i++) std::cout << data[i] << " "; std::cout << std::endl;
39-
40-
free(data, q);
41-
return 0;
4242
}
43-

DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/src/sub_group_shuffle.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ int main() {
2525
auto i = item.get_global_id(0);
2626

2727
//# swap adjacent items in array using sub_group permute_group_by_xor
28-
data[i] = sg.shuffle_xor(data[i], 1);
28+
data[i] = permute_group_by_xor(sg, data[i], 1);
2929

3030
//# reverse the order of items in sub_group using permute_group_by_xor
31-
//data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);
31+
//data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);
3232

3333
}).wait();
3434

0 commit comments

Comments
 (0)