diff --git a/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/Sub_Groups.ipynb b/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/Sub_Groups.ipynb index 01e406d7ca..ba972d328e 100644 --- a/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/Sub_Groups.ipynb +++ b/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/Sub_Groups.ipynb @@ -145,7 +145,11 @@ "metadata": {}, "source": [ "```cpp\n", - " ONEAPI::sub_group sg = item.get_sub_group();\n", + " sycl::sub_group sg = nd_item.get_sub_group();\n", + "\n", + " OR\n", + "\n", + " auto sg = nd_item.get_sub_group();\n", "```" ] }, @@ -339,40 +343,40 @@ "#include \n", "using namespace sycl;\n", "\n", - "static constexpr size_t N = 256; // global size\n", + "static constexpr size_t N = 64; // global size\n", "static constexpr size_t B = 64; // work-group size\n", - "static constexpr size_t S = 16; // sub_group size\n", "\n", "int main() {\n", " queue q;\n", - " std::cout << \"Device : \" << q.get_device().get_info() << std::endl;\n", - " \n", + " std::cout << \"Device : \" << q.get_device().get_info() << \"\\n\";\n", + "\n", + " //# get all supported sub_group sizes and print\n", " auto sg_sizes = q.get_device().get_info();\n", " std::cout << \"Supported Sub-Group Sizes : \";\n", - " for (int i=0; i(N, q);\n", - " for(int i=0; i(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(8)]]{\n", - " auto sg = item.get_sub_group();\n", - " auto i = item.get_global_id(0);\n", + " std::cout << \"Max Sub-Group Size : \" << max_sg_size[0] << \"\\n\";\n", + " \n", + " q.submit([&](handler &h) {\n", + " //# setup sycl stream class to print standard output from device code\n", + " auto out = stream(1024, 768, h);\n", "\n", - " //# write sub_group tp zero except first location for each sub_group\n", - " if (sg.get_local_id()[0] != 0) data[i] = 0;\n", + " //# nd-range kernel with user specified sub_group size\n", + " h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {\n", + " //# get sub_group handle\n", + " auto sg = item.get_sub_group();\n", "\n", + " //# query sub_group and print sub_group info once per sub_group\n", + " if (sg.get_local_id()[0] == 0) {\n", + " out << \"sub_group id: \" << sg.get_group_id()[0] << \" of \"\n", + " << sg.get_group_range()[0] << \", size=\" << sg.get_local_range()[0]\n", + " << endl;\n", + " }\n", + " });\n", " }).wait();\n", - "\n", - " for(int i=0; i(N,B), [=](nd_item<1> item){\n", "      auto sg = item.get_sub_group();\n", - "      size_t i = item.get_global_id(0);\n", + "      auto i = item.get_global_id(0);\n", "      /* Shuffles */\n", - "      //data[i] = sg.shuffle(data[i], 2);\n", - "      //data[i] = sg.shuffle_up(data[i], 1);\n", - "      //data[i] = sg.shuffle_down(data[i], 1);\n", - "      data[i] = sg.shuffle_xor(data[i], 1);\n", + "      //data[i] = select_by_group(sg, data[i], 2);\n", + "      //data[i] = shift_group_left(sg, data[i], 1);\n", + "      //data[i] = shift_group_right(sg, data[i], 1);\n", + "      data[i] = permute_group_by_xor(sg, data[i], 1);\n", " });\n", "\n", "```\n", @@ -492,10 +496,10 @@ " auto i = item.get_global_id(0);\n", "\n", " //# swap adjacent items in array using sub_group shuffle_xor\n", - " data[i] = sg.shuffle_xor(data[i], 1);\n", + " data[i] = permute_group_by_xor(sg, data[i], 1);\n", " \n", " //# reverse the order of items in sub_group using shuffle_xor\n", - " //data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);\n", + " //data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);\n", " \n", " }).wait();\n", "\n", diff --git a/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/lab/sub_group_reqd_size.cpp b/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/lab/sub_group_reqd_size.cpp index bf402569d1..bc1960f73f 100644 --- a/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/lab/sub_group_reqd_size.cpp +++ b/DirectProgramming/DPC++/Jupyter/oneapi-essentials-training/04_DPCPP_Sub_Groups/lab/sub_group_reqd_size.cpp @@ -6,38 +6,37 @@ #include using namespace sycl; -static constexpr size_t N = 256; // global size +static constexpr size_t N = 64; // global size static constexpr size_t B = 64; // work-group size -static constexpr size_t S = 32; // sub_group size int main() { queue q; - std::cout << "Device : " << q.get_device().get_info() << std::endl; - + std::cout << "Device : " << q.get_device().get_info() << "\n"; + + //# get all supported sub_group sizes and print auto sg_sizes = q.get_device().get_info(); std::cout << "Supported Sub-Group Sizes : "; - for (int i=0; i(N, q); - for(int i=0; i(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(S)]] { - auto sg = item.get_sub_group(); - auto i = item.get_global_id(0); - - //# write sub_group tp zero except first location for each sub_group - if (sg.get_local_id()[0] != 0) data[i] = 0; - + std::cout << "Max Sub-Group Size : " << max_sg_size[0] << "\n"; + + q.submit([&](handler &h) { + //# setup sycl stream class to print standard output from device code + auto out = stream(1024, 768, h); + + //# nd-range kernel with user specified sub_group size + h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] { + //# get sub_group handle + auto sg = item.get_sub_group(); + + //# query sub_group and print sub_group info once per sub_group + if (sg.get_local_id()[0] == 0) { + out << "sub_group id: " << sg.get_group_id()[0] << " of " + << sg.get_group_range()[0] << ", size=" << sg.get_local_range()[0] + << endl; + } + }); }).wait(); - - for(int i=0; i using namespace sycl; -static constexpr size_t N = 256; // global size +static constexpr size_t N = 64; // global size static constexpr size_t B = 64; // work-group size -static constexpr size_t S = 32; // sub_group size int main() { queue q; - std::cout << "Device : " << q.get_device().get_info() << std::endl; - + std::cout << "Device : " << q.get_device().get_info() << "\n"; + + //# get all supported sub_group sizes and print auto sg_sizes = q.get_device().get_info(); std::cout << "Supported Sub-Group Sizes : "; - for (int i=0; i(N, q); - for(int i=0; i(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(S)]] { - auto sg = item.get_sub_group(); - auto i = item.get_global_id(0); - - //# write sub_group tp zero except first location for each sub_group - if (sg.get_local_id()[0] != 0) data[i] = 0; - + std::cout << "Max Sub-Group Size : " << max_sg_size[0] << "\n"; + + q.submit([&](handler &h) { + //# setup sycl stream class to print standard output from device code + auto out = stream(1024, 768, h); + + //# nd-range kernel with user specified sub_group size + h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] { + //# get sub_group handle + auto sg = item.get_sub_group(); + + //# query sub_group and print sub_group info once per sub_group + if (sg.get_local_id()[0] == 0) { + out << "sub_group id: " << sg.get_group_id()[0] << " of " + << sg.get_group_range()[0] << ", size=" << sg.get_local_range()[0] + << endl; + } + }); }).wait(); - - for(int i=0; i