|
145 | 145 | "metadata": {},
|
146 | 146 | "source": [
|
147 | 147 | "```cpp\n",
|
148 |
| - " ONEAPI::sub_group sg = item.get_sub_group();\n", |
| 148 | + " sycl::sub_group sg = nd_item.get_sub_group();\n", |
| 149 | + "\n", |
| 150 | + " OR\n", |
| 151 | + "\n", |
| 152 | + " auto sg = nd_item.get_sub_group();\n", |
149 | 153 | "```"
|
150 | 154 | ]
|
151 | 155 | },
|
|
339 | 343 | "#include <CL/sycl.hpp>\n",
|
340 | 344 | "using namespace sycl;\n",
|
341 | 345 | "\n",
|
342 |
| - "static constexpr size_t N = 256; // global size\n", |
| 346 | + "static constexpr size_t N = 64; // global size\n", |
343 | 347 | "static constexpr size_t B = 64; // work-group size\n",
|
344 |
| - "static constexpr size_t S = 16; // sub_group size\n", |
345 | 348 | "\n",
|
346 | 349 | "int main() {\n",
|
347 | 350 | " queue q;\n",
|
348 |
| - " std::cout << \"Device : \" << q.get_device().get_info<info::device::name>() << std::endl;\n", |
349 |
| - " \n", |
| 351 | + " std::cout << \"Device : \" << q.get_device().get_info<info::device::name>() << \"\\n\";\n", |
| 352 | + "\n", |
| 353 | + " //# get all supported sub_group sizes and print\n", |
350 | 354 | " auto sg_sizes = q.get_device().get_info<info::device::sub_group_sizes>();\n",
|
351 | 355 | " std::cout << \"Supported Sub-Group Sizes : \";\n",
|
352 |
| - " for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << \" \"; std::cout << std::endl;\n", |
| 356 | + " for (int i=0; i<sg_sizes.size(); i++) std::cout << sg_sizes[i] << \" \"; std::cout << \"\\n\";\n", |
353 | 357 | " \n",
|
| 358 | + " //# find out maximum supported sub_group size\n", |
354 | 359 | " auto max_sg_size = std::max_element(sg_sizes.begin(), sg_sizes.end());\n",
|
355 |
| - " std::cout << \"Max Sub-Group Size : \" << max_sg_size[0] << std::endl;\n", |
356 |
| - "\n", |
357 |
| - " //# initialize data array using usm\n", |
358 |
| - " int *data = malloc_shared<int>(N, q);\n", |
359 |
| - " for(int i=0; i<N; i++) data[i] = i;\n", |
360 |
| - "\n", |
361 |
| - " //# use parallel_for and sub_groups\n", |
362 |
| - " q.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(8)]]{\n", |
363 |
| - " auto sg = item.get_sub_group();\n", |
364 |
| - " auto i = item.get_global_id(0);\n", |
| 360 | + " std::cout << \"Max Sub-Group Size : \" << max_sg_size[0] << \"\\n\";\n", |
| 361 | + " \n", |
| 362 | + " q.submit([&](handler &h) {\n", |
| 363 | + " //# setup sycl stream class to print standard output from device code\n", |
| 364 | + " auto out = stream(1024, 768, h);\n", |
365 | 365 | "\n",
|
366 |
| - " //# write sub_group tp zero except first location for each sub_group\n", |
367 |
| - " if (sg.get_local_id()[0] != 0) data[i] = 0;\n", |
| 366 | + " //# nd-range kernel with user specified sub_group size\n", |
| 367 | + " h.parallel_for(nd_range<1>(N, B), [=](nd_item<1> item)[[intel::reqd_sub_group_size(32)]] {\n", |
| 368 | + " //# get sub_group handle\n", |
| 369 | + " auto sg = item.get_sub_group();\n", |
368 | 370 | "\n",
|
| 371 | + " //# query sub_group and print sub_group info once per sub_group\n", |
| 372 | + " if (sg.get_local_id()[0] == 0) {\n", |
| 373 | + " out << \"sub_group id: \" << sg.get_group_id()[0] << \" of \"\n", |
| 374 | + " << sg.get_group_range()[0] << \", size=\" << sg.get_local_range()[0]\n", |
| 375 | + " << endl;\n", |
| 376 | + " }\n", |
| 377 | + " });\n", |
369 | 378 | " }).wait();\n",
|
370 |
| - "\n", |
371 |
| - " for(int i=0; i<N; i++) std::cout << data[i] << \" \"; std::cout << std::endl;\n", |
372 |
| - " \n", |
373 |
| - " free(data, q);\n", |
374 |
| - " return 0;\n", |
375 |
| - "}\n" |
| 379 | + "}" |
376 | 380 | ]
|
377 | 381 | },
|
378 | 382 | {
|
|
416 | 420 | "Shuffle operations enable us to remove work-group local memory usage from our kernels and/or to __avoid unnecessary repeated accesses to global memory__.\n",
|
417 | 421 | "\n",
|
418 | 422 | "Below are the different types of shuffle operations available for sub-groups:\n",
|
419 |
| - "- shuffle\n", |
420 |
| - "- shuffle_up\n", |
421 |
| - "- shuffle_down\n", |
422 |
| - "- shuffle_xor\n", |
| 423 | + "- `select_by_group(sg, x, id)`\n", |
| 424 | + "- `shift_group_left(sg, x, delta)`\n", |
| 425 | + "- `shift_group_right(sg, x, delta)`\n", |
| 426 | + "- `permute_group_by_xor(sg, x, mask)`\n", |
423 | 427 | "\n",
|
424 | 428 | "The code below uses `shuffle_xor` to swap the values of two work-items:\n",
|
425 | 429 | "\n",
|
426 | 430 | "```cpp\n",
|
427 | 431 | " h.parallel_for(nd_range<1>(N,B), [=](nd_item<1> item){\n",
|
428 | 432 | " auto sg = item.get_sub_group();\n",
|
429 |
| - " size_t i = item.get_global_id(0);\n", |
| 433 | + " auto i = item.get_global_id(0);\n", |
430 | 434 | " /* Shuffles */\n",
|
431 |
| - " //data[i] = sg.shuffle(data[i], 2);\n", |
432 |
| - " //data[i] = sg.shuffle_up(data[i], 1);\n", |
433 |
| - " //data[i] = sg.shuffle_down(data[i], 1);\n", |
434 |
| - " data[i] = sg.shuffle_xor(data[i], 1);\n", |
| 435 | + " //data[i] = select_by_group(sg, data[i], 2);\n", |
| 436 | + " //data[i] = shift_group_left(sg, data[i], 1);\n", |
| 437 | + " //data[i] = shift_group_right(sg, data[i], 1);\n", |
| 438 | + " data[i] = permute_group_by_xor(sg, data[i], 1);\n", |
435 | 439 | " });\n",
|
436 | 440 | "\n",
|
437 | 441 | "```\n",
|
|
492 | 496 | " auto i = item.get_global_id(0);\n",
|
493 | 497 | "\n",
|
494 | 498 | " //# swap adjacent items in array using sub_group shuffle_xor\n",
|
495 |
| - " data[i] = sg.shuffle_xor(data[i], 1);\n", |
| 499 | + " data[i] = permute_group_by_xor(sg, data[i], 1);\n", |
496 | 500 | " \n",
|
497 | 501 | " //# reverse the order of items in sub_group using shuffle_xor\n",
|
498 |
| - " //data[i] = sg.shuffle_xor(data[i], sg.get_max_local_range() - 1);\n", |
| 502 | + " //data[i] = permute_group_by_xor(sg, data[i], sg.get_max_local_range() - 1);\n", |
499 | 503 | " \n",
|
500 | 504 | " }).wait();\n",
|
501 | 505 | "\n",
|
|
0 commit comments