From a015f1e5762ba3618d648616de192a5b05d9db01 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Mon, 13 Jul 2020 15:03:48 -0700 Subject: [PATCH 01/23] Add bitonic-sort sample. --- .../bitonic-sort/CMakeLists.txt | 30 +++ .../GraphTraversal/bitonic-sort/License.txt | 7 + .../GraphTraversal/bitonic-sort/README.md | 120 +++++++++ .../bitonic-sort/bitonic-sort.sln | 25 ++ .../bitonic-sort/bitonic-sort.vcxproj | 137 ++++++++++ .../bitonic-sort/bitonic-sort.vcxproj.filters | 22 ++ .../bitonic-sort/bitonic-sort.vcxproj.user | 11 + .../GraphTraversal/bitonic-sort/sample.json | 29 ++ .../bitonic-sort/src/bitonic-sort.cpp | 253 ++++++++++++++++++ 9 files changed, 634 insertions(+) create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt new file mode 100644 index 0000000000..b9c9718926 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (bitonic-sort) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (bitonic-sort src/bitonic-sort.cpp) + +add_custom_target (run + COMMAND bitonic-sort 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt new file mode 100644 index 0000000000..6e9524bd74 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt @@ -0,0 +1,7 @@ +Copyright 2020 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md new file mode 100644 index 0000000000..8066fd1f31 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -0,0 +1,120 @@ +# `Bitonic Sort` sample + +This code sample demonstrates the implementation of bitonic sort using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm sorts the sequence in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +The algorithm converts a randomized sequence of numbers into +a bitonic sequence (two ordered sequences), and then merge these two ordered +sequences into a ordered sequence. Bitonic sort algorithm is briefly +described as followed: + +- First, it decomposes the randomized sequence of size 2\*\*n into 2\*\*(n-1) +pairs where each pair consists of 2 consecutive elements. Note that each pair +is a bitonic sequence. +- Step 0: for each pair (sequence of size 2), the two elements are swapped so +that the two consecutive pairs form a bitonic sequence in increasing order, +the next two pairs form the second bitonic sequence in decreasing order, the +next two pairs form the third bitonic sequence in increasing order, etc, .... +At the end of this step, we have 2\*\*(n-1) bitonic sequences of size 2, and +they follow an order increasing, decreasing, increasing, .., decreasing. +Thus, they form 2\*\*(n-2) bitonic sequences of size 4. +- Step 1: for each new 2\*\*(n-2) bitonic sequences of size 4, (each new +sequence consists of 2 consecutive previous sequences), it swaps the elements +so that at the end of step 1, we have 2\*\*(n-2) bitonic sequences of size 4, +and they follow an order: increasing, decreasing, increasing, ..., +decreasing. Thus, they form 2\*\*(n-3) bitonic sequences of size 8. +- Same logic applies until we reach the last step. +- Step n: at this last step, we have one bitonic sequence of size 2\*\*n. The +elements in the sequence are swapped until we have a sequence in increasing +oder. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU +if a compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command g +roups. Unified Shared Memory (USM) is used for data management. 
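+
+The core of the parallel implementation is a compare-exchange pass executed once per (step, stage) pair. The sketch below mirrors that logic in plain serial C++ so the index arithmetic is easier to follow; the function name is illustrative and this is not the sample's exact kernel. It assumes the array length is a power of two.
+
+```cpp
+// One compare-exchange pass at a given (step, stage).
+// seq_len = 2**(stage+1); the sort direction alternates every
+// 2**(step-stage) sequences. The full sort runs this pass for
+// step = 0..n-1 and, within each step, stage = step..0.
+void CompareExchangePass(int *a, int size, int step, int stage) {
+  int seq_len = 1 << (stage + 1);
+  int two_power = 1 << (step - stage);
+  for (int i = 0; i < size; ++i) {
+    int seq_num = i / seq_len;
+    int h_len = seq_len / 2;
+    // Only elements in the first half of each sequence initiate a swap.
+    if (i < seq_num * seq_len + h_len) {
+      int partner = i + h_len;
+      bool increasing = ((seq_num / two_power) % 2) == 0;
+      if ((increasing && a[i] > a[partner]) ||
+          (!increasing && a[i] < a[partner])) {
+        int temp = a[i];
+        a[i] = a[partner];
+        a[partner] = temp;
+      }
+    }
+  }
+}
+```
+
+In the DPC++ version, the body of the loop over `i` becomes the `parallel_for` kernel, while the host loops over `step` and `stage` and waits on the queue between passes.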
+ +## License +This code sample is licensed under MIT license + +## Building the `bitonic-sort` Program for CPU and GPU + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd bitonic-sort + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild bitonic-sort.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: bitonic-sort + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the computation in serial in the CPU. +The results from the parallel and serial computation are compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./bitonic-sort 21 47 +Array size: 2097152, seed: 47 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 0.416827 sec +CPU serial time: 0.60523 sec +Success! 
+``` \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln new file mode 100644 index 0000000000..e558305981 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.1062 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic-sort", "bitonic-sort.vcxproj", "{46454D0B-76F3-45EB-A186-F315A2E22DEA}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.ActiveCfg = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.Build.0 = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.ActiveCfg = Release|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {B1D84B81-F5D5-4459-AA6E-38B695FB908B} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj new file mode 100644 index 0000000000..9289102064 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {46454d0b-76f3-45eb-a186-f315a2e22dea} + Win32Proj + bitonic_sort + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters new file mode 100644 index 0000000000..82a4ddcfc9 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user new file mode 100644 index 0000000000..582de7464a --- /dev/null +++ 
b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json new file mode 100644 index 0000000000..c382d764e1 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "4D5B57B8-6F34-4A11-89F5-3F07E766DB39", + "name": "bitonic-sort", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Bitonic Sort using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild bitonic-sort.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "bitonic-sort.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp new file mode 100644 index 0000000000..e0e4312520 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -0,0 +1,253 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Bitonic Sort: this algorithm converts a randomized sequence of numbers into +// a bitonic sequence (two ordered sequences), and then merge these two ordered +// sequences into a ordered sequence. Bitonic sort algorithm is briefly +// described as followed: +// +// - First, it decomposes the randomized sequence of size 2**n into 2**(n-1) +// pairs where each pair consists of 2 consecutive elements. Note that each pair +// is a bitonic sequence. +// - Step 0: for each pair (sequence of size 2), the two elements are swapped so +// that the two consecutive pairs form a bitonic sequence in increasing order, +// the next two pairs form the second bitonic sequence in decreasing order, the +// next two pairs form the third bitonic sequence in increasing order, etc, ... +// . At the end of this step, we have 2**(n-1) bitonic sequences of size 2, and +// they follow an order increasing, decreasing, increasing, .., decreasing. +// Thus, they form 2**(n-2) bitonic sequences of size 4. +// - Step 1: for each new 2**(n-2) bitonic sequences of size 4, (each new +// sequence consists of 2 consecutive previous sequences), it swaps the elements +// so that at the end of step 1, we have 2**(n-2) bitonic sequences of size 4, +// and they follow an order: increasing, decreasing, increasing, ..., +// decreasing. Thus, they form 2**(n-3) bitonic sequences of size 8. +// - Same logic applies until we reach the last step. +// - Step n: at this last step, we have one bitonic sequence of size 2**n. The +// elements in the sequence are swapped until we have a sequence in increasing +// oder. +// +// In this implementation, a randomized sequence of size 2**n is given (n is a +// positive number). Unified Shared Memory (USM) is used for data management. 
At +// each stage, a part of step, the host redefines the ordered sequenes and sends +// data to the kernel. The kernel swaps the elements accordingly in parallel. +// +#include +#include +#include + +using namespace sycl; +using namespace std; + +void ParallelBitonicSort(int a[], int n, queue &q) { + // n: the exponent used to set the array size. Array size = power(2, n) + int size = pow(2, n); + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1, ..., 0 + for (int stage = step; stage >= 0; stage--) { + // In each state, construct a number (num_seq) of bitonic sequences of + // size seq_len (2, 4, ...) num_seq stores the number of bitonic sequences + // at each stage. seq_len stores the length of the bitonic sequence at + // each stage. + int seq_len = pow(2, stage + 1); +#if DEBUG + int num_seq = pow(2, (n - stage - 1)); // Used for debug purpose. + std::cout << "step num:" << step << " stage num:" << stage + << " num_seq:" << num_seq << "(" << seq_len << ") => "; +#endif + // Constant used in the kernel: 2**(step-stage). + int two_power = 1 << (step - stage); + + // Offload the work to kernel. + q.submit([&](handler &h) { + h.parallel_for(range<1>(size), [=](id<1> i) { + // Assign the bitonic sequence number. + int seq_num = i / seq_len; + + // Variable used to identified the swapped element. + int swapped_ele = -1; + + // Because the elements in the first half in the bitonic + // sequence may swap with elements in the second half, + // only the first half of elements in each sequence is + // required (seq_len/2). + int h_len = seq_len / 2; + + if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len; + + // Check whether increasing or decreasing order. + int odd = seq_num / two_power; + + // Boolean variable used to determine "increasing" or + // "decreasing" order. + bool increasing = ((odd % 2) == 0); + + // Swap the elements in the bitonic sequence if needed + if (swapped_ele != -1) { + if (((a[i] > a[swapped_ele]) && increasing) || + ((a[i] < a[swapped_ele]) && !increasing)) { + int temp = a[i]; + a[i] = a[swapped_ele]; + a[swapped_ele] = temp; + } + } + }); + }); + q.wait(); + } // end stage + } // end step +} + +// Loop over the bitonic sequences at each stage in serial. +void SwapElements(int step, int stage, int num_sequence, int seq_len, + int *array) { + for (int seq_num = 0; seq_num < num_sequence; seq_num++) { + int odd = seq_num / (pow(2, (step - stage))); + bool increasing = ((odd % 2) == 0); + + int h_len = seq_len / 2; + + // For all elements in a bitonic sequence, swap them if needed + for (int i = seq_num * seq_len; i < seq_num * seq_len + h_len; i++) { + int swapped_ele = i + h_len; + + if (((array[i] > array[swapped_ele]) && increasing) || + ((array[i] < array[swapped_ele]) && !increasing)) { + int temp = array[i]; + array[i] = array[swapped_ele]; + array[swapped_ele] = temp; + } + } // end for all elements in a sequence + } // end all sequences +} + +// Function sorts an array in serial using bitonic sort algorithm. The size of +// the array is indicated by the exponent n: the array size is 2 ** n. +inline void BitonicSort(int a[], int n) { + // n: the exponent indicating the array size = 2 ** n. + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1,..., 0 + for (int stage = step; stage >= 0; stage--) { + // Sequences (same size) are formed at each stage. 
+ int num_sequence = pow(2, (n - stage - 1)); + // The length of the sequences (2, 4, ...). + int sequence_len = pow(2, stage + 1); + + SwapElements(step, stage, num_sequence, sequence_len, a); + } + } +} + +// Function showing the array. +void DisplayArray(int a[], int array_size) { + for (int i = 0; i < array_size; ++i) std::cout << a[i] << " "; + std::cout << "\n"; +} + +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. " + "The number of element in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please " + "enter the corresponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char *argv[]) { + int n, seed, size; + int exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + size = pow(2, n); + } catch (...) { + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nArray size: " << size << ", seed: " << seed << "\n"; + + // Create queue on implementation-chosen default device. + queue q; + + std::cout << "Device: " << q.get_device().get_info() + << "\n"; + + // USM allocation using malloc_shared: data stores a sequence of random + // numbers. + int *data = malloc_shared(size, q); + + // Memory allocated for host access only. + int *data2 = (int *)malloc(size * sizeof(int)); + + // Initialize the array randomly using a seed. + srand(seed); + + for (int i = 0; i < size; i++) data[i] = data2[i] = rand() % 1000; + +#if DEBUG + std::cout << "\ndata before:\n"; + DisplayArray(data, size); +#endif + + // Start timer + dpc_common::TimeInterval t_par; + + ParallelBitonicSort(data, n, q); + + std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + +#if DEBUG + std::cout << "\ndata after sorting using parallel bitonic sort:\n"; + DisplayArray(data, size); +#endif + + // Start timer + dpc_common::TimeInterval t_ser; + + // Bitonic sort in CPU (serial) + BitonicSort(data2, n); + + std::cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n"; + + // Verify both bitonic sort algorithms in kernel and in CPU. + bool pass = true; + for (int i = 0; i < size - 1; i++) { + // Validate the sequence order is increasing in both kernel and CPU. + if ((data[i] > data[i + 1]) || (data[i] != data2[i])) { + pass = false; + break; + } + } + + // Clean USM resources. + free(data, q); + + // Clean CPU memory. + free(data2); + + if (!pass) { + std::cout << "\nFailed!\n"; + return -2; + } + + std::cout << "\nSuccess!\n"; + return 0; +} From d3ab2190a761fe04d33139ebd8e041228ce28d2d Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Tue, 14 Jul 2020 11:43:03 -0700 Subject: [PATCH 02/23] Add a note about common file in README. 
Signed-off-by: Loc Nguyen --- DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md index 8066fd1f31..190b5874d0 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -58,6 +58,9 @@ This code sample is licensed under MIT license ## Building the `bitonic-sort` Program for CPU and GPU +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + ### Running Samples In DevCloud If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI From 8818264398babb35c72fd30fa44d2d68b6cbcbe4 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Tue, 14 Jul 2020 14:16:39 -0700 Subject: [PATCH 03/23] Move 1d_HeatTransfer sample to open source GitHub. Signed-off-by: Loc Nguyen --- .../1d_HeatTransfer/1d_HeatTransfer.sln | 25 ++ .../1d_HeatTransfer/1d_HeatTransfer.vcxproj | 149 +++++++++ .../1d_HeatTransfer.vcxproj.filters | 22 ++ .../1d_HeatTransfer.vcxproj.user | 11 + .../1d_HeatTransfer/CMakeLists.txt | 27 ++ .../1d_HeatTransfer/License.txt | 7 + .../StructuredGrids/1d_HeatTransfer/README.md | 120 ++++++++ .../1d_HeatTransfer/sample.json | 29 ++ .../1d_HeatTransfer/src/1d_HeatTransfer.cpp | 288 ++++++++++++++++++ 9 files changed, 678 insertions(+) create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln new file mode 100644 index 0000000000..b127a6bb7d --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.1062 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "1d_HeatTransfer", "1d_HeatTransfer.vcxproj", "{D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Debug|x64.ActiveCfg = Debug|x64 + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Debug|x64.Build.0 = Debug|x64 + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Release|x64.ActiveCfg = 
Release|x64 + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {81F9E738-2753-4252-82FA-AE9A402B7ABD} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj new file mode 100644 index 0000000000..474bc4dd1e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj @@ -0,0 +1,149 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {d3ab428e-a631-4ba1-a526-9a05b7b8e8ce} + Win32Proj + _1d_HeatTransfer + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters new file mode 100644 index 0000000000..8efae7e89b --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user new file mode 100644 index 0000000000..1bfa967dac --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user @@ -0,0 +1,11 @@ + + + + 100 1000 + WindowsLocalDebugger + + + 100 1000 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt new file mode 100644 index 0000000000..5efeb945b3 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt @@ -0,0 +1,27 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +# CMakeLists.txt for 1d_HeatTransfer project +project (1d_HeatTransfer) + +set(CMAKE_CXX_COMPILER "dpcpp") + +# Set default build type to RelWithDebInfo if not specified +if (NOT 
CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (1d_HeatTransfer src/1d_HeatTransfer.cpp) + +add_custom_target (run + COMMAND 1d_HeatTransfer 100 1000 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt new file mode 100644 index 0000000000..0578223382 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt @@ -0,0 +1,7 @@ +Copyright 2019 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md new file mode 100644 index 0000000000..6459b25e05 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md @@ -0,0 +1,120 @@ +# 1D-Heat-Transfer Sample + +This code sample demonstrates the simulation of one dimensional heat transfer process using +Intel Data Parallel C++. Kernels in this example are implemented as a discretized differential +equation with second derivative in space and first derivative in time. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | How to simulate 1D Heat Transfer using Intel DPC++ compiler +| Time to complete | 10 minutes + + +## Purpose + +1D-Heat-Transfer is a DPC++ application that simulates the heat propagation on a one-dimensional +isotropic and homogeneous medium. The following equation is used in the simulation of heat propagation: + +dU/dt = k * d2U/dx2 + +Where: +dU/dt is the rate of change of temperature at a point. +k is the thermal difusivity. +d2U/dx2 is the second spatial derivative. 
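+
+Discretizing with a forward difference in time and a central difference in space gives:
+
+(U(i,t+dt) - U(i,t)) / dt = k * (U(i+1,t) - 2 * U(i,t) + U(i-1,t)) / (dx * dx)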
+ +Or + +U(i) = C * (U(i+1) - 2 * U(i) + U(i-1)) + U(i) + +where constant C = k * dt / (dx * dx) + +The code sample includes both parallel and serial calculation of heat propagation. The code sample will +attempt first to execute on an available GPU and fallback to the system's CPU if a compatible GPU is +not detected. The results are stored in a file. + + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command groups. + + +## License + +This code sample is licensed under MIT license + + +## Building the `1d_HeatTransfer` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system". + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System + 1. Build the program using the following `cmake` commands. + ``` + $ cd 1d_HeatTransfer + $ mkdir build + $ cd build + $ cmake .. + $ make -j + ``` + + 2. Run the program + + ``` + make run + ``` + + 3. Clean the program + + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild 1d_HeatTransfer.sln /t:Rebuild /p:Configuration="Release" + + +## Running the sample +### Application Parameters + + Usage: 1d_HeatTransfer + +where + +n is the number of points you want to simulate the heat transfer. + +i is the number of timesteps in the simulation. + +The sample offloads the computation to GPU and also performs a computation in serial in the CPU. +The results from the parallel and serial computation are compared and stored in an output file named "error_diff.txt". +If the results match, the application will display a “PASSED” message. + +### Example of Output +``` +$ ./1d_HeatTransfer 100 1000 +Number of points: 100 +Number of iterations: 1000 +Kernel runs on Intel(R) Gen9 HD Graphics NEO +Kernel time: 0.347854 sec +PASSED! There is no difference between the results computed in host and in kernel. 
+``` diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json new file mode 100644 index 0000000000..4346f22435 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "CDF82E46-91D4-46F6-B1E2-5C21443F41BC", + "name": "1d_HeatTransfer", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "1D_HEAT_TRANSFER: Simulating 1D Heat Transfer problem Using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild 1d_HeatTransfer.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "1d_HeatTransfer.exe 100 1000" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp new file mode 100644 index 0000000000..83f3405ba3 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp @@ -0,0 +1,288 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// 1D HEAT TRANSFER: Using Intel® oneAPI DPC++ Language to simulate 1D Heat +// Transfer. +// +// The code sample simulates the heat propagation according to the following +// equation (case where there is no heat generation): +// +// dU/dt = k * d2U/dx2 +// (u(x,t+DT) - u(x,t)) / DT = k * (u(x+DX,t)- 2u(x,t) + u(x-DX,t)) / DX2 +// U(i) = C * (U(i+1) - 2 * U(i) + U(i-1)) + U(i) +// +// where constant C = k * dt / (dx * dx) +// +// For comprehensive instructions regarding DPC++ Programming, go to +// https://software.intel.com/en-us/oneapi-programming-guide +// and search based on relevant terms noted in the comments. +// +// DPC++ material used in this code sample: +// +// Basic structures of DPC++: +// DPC++ Queues (including device selectors and exception handlers) +// DPC++ Buffers and accessors (communicate data between the host and the +// device) +// DPC++ Kernels (including parallel_for function and range<1> objects) +// +//****************************************************************************** +// Content: (version 1.1) +// 1d_HeatTransfer.cpp +// +//****************************************************************************** +#include +#include +#include +#include +#include + +using namespace sycl; + +constexpr float dt = 0.002f; +constexpr float dx = 0.01f; +constexpr float k = 0.025f; +constexpr float temp = 100.0f; // Initial temperature. + +//************************************ +// Function description: display input parameters used for this sample. +//************************************ +void Usage(std::string programName) { + std::cout << " Incorrect parameters \n"; + std::cout << " Usage: "; + std::cout << programName << " \n\n"; + std::cout << " n : Number of points to simulate \n"; + std::cout << " i : Number of timesteps \n"; +} + +//************************************ +// Function description: initialize the array. 
+//************************************ +void Initialize(float* array, unsigned int num, unsigned int idx) { + for (unsigned int i = idx; i < num; i++) array[i] = 0.0f; +} + +//************************************ +// Function description: compute the heat in the device (in parallel). +//************************************ +float* ComputeHeatDeviceParallel(float* arr, float* arr_next, float C, + unsigned int num_p, unsigned int num_iter, + float temp) { + unsigned int i; + + try { + // Define the device queue + queue q = default_selector{}; + std::cout << "Kernel runs on " + << q.get_device().get_info() << "\n"; + + // Set boundary condition at one end. + arr[0] = arr_next[0] = temp; + + float* current_data_ptr = arr; + float* next_data_ptr = arr_next; + // current_data_ptr = arr; + // next_data_ptr = arr_next; + + // Buffer scope + { + buffer arr_buf(current_data_ptr, range<1>{num_p + 2}); + buffer arr_next_buf(next_data_ptr, range<1>{num_p + 2}); + + // Iterate over timesteps + for (i = 1; i <= num_iter; i++) { + if (i % 2 != 0) { + q.submit([&](handler& h) { + // The size of memory amount that will be given to the buffer. + range<1> num_items{num_p + 2}; + + auto arr_acc = arr_buf.get_access(h); + auto arr_next_acc = + arr_next_buf.get_access(h); + + h.parallel_for(num_items, [=](id<1> k) { + size_t gid = k.get(0); + + if (gid == 0) { + } else if (gid == num_p + 1) { + arr_next_acc[k] = arr_acc[k - 1]; + } else { + arr_next_acc[k] = + C * (arr_acc[k + 1] - 2 * arr_acc[k] + arr_acc[k - 1]) + + arr_acc[k]; + } + }); // end parallel for loop in kernel1 + }); // end device queue + + } else { + q.submit([&](handler& h) { + // The size of memory amount that will be given to the buffer. + range<1> num_items{num_p + 2}; + + auto arr_acc = arr_buf.get_access(h); + auto arr_next_acc = + arr_next_buf.get_access(h); + + h.parallel_for(num_items, [=](id<1> k) { + size_t gid = k.get(0); + + if (gid == 0) { + } else if (gid == num_p + 1) { + arr_acc[k] = arr_next_acc[k - 1]; + } else { + arr_acc[k] = C * (arr_next_acc[k + 1] - 2 * arr_next_acc[k] + + arr_next_acc[k - 1]) + + arr_next_acc[k]; + } + }); // end parallel for loop in kernel2 + }); // end device queue + } // end if %2 + } // end iteration + } // end buffer scope + + q.wait_and_throw(); + + } catch (cl::sycl::exception e) { + std::cout << "SYCL exception caught: " << e.what() << "\n"; + } + + if (i % 2 != 0) + return arr; + else + return arr_next; +} + +//************************************ +// Function description: compute the heat in the host (in serial). +//************************************ +float* ComputeHeatHostSerial(float* arr, float* arr_next, float C, + unsigned int num_p, unsigned int num_iter, + float temp) { + unsigned int i, k; + float* swap; + + // Set initial condition + Initialize(arr, num_p + 2, 0); + Initialize(arr_next, num_p + 2, 0); + + // Set boundary condition at one end. + arr[0] = arr_next[0] = temp; + + // Iterate over timesteps + for (i = 1; i <= num_iter; i++) { + for (k = 1; k <= num_p; k++) + arr_next[k] = C * (arr[k + 1] - 2 * arr[k] + arr[k - 1]) + arr[k]; + + arr_next[num_p + 1] = arr[num_p]; + + // Swap the buffers at every iteration. + swap = arr; + arr = arr_next; + arr_next = swap; + } + + return arr; +} + +//************************************ +// Function description: calculate the results computed by the host and by the +// device. 
+//************************************ +bool CompareResults(float* device_results, float* host_results, + unsigned int num_point, float C) { + float delta = 0.001f; + float difference = 0.00f; + double norm2 = 0; + bool err = false; + + std::ofstream err_file; + err_file.open("error_diff.txt"); + + err_file << " \t idx\theat[i]\t\theat_CPU[i] \n"; + + for (unsigned int i = 0; i < num_point + 2; i++) { + err_file << "\n RESULT: " << i << "\t" << std::setw(12) << std::left + << device_results[i] << "\t" << host_results[i]; + + difference = fabsf(host_results[i] - device_results[i]); + norm2 += difference * difference; + + if (difference > delta) { + err = true; + err_file << ", diff: " << difference; + } + } + + return err; +} + +int main(int argc, char* argv[]) { + unsigned int n_point; // The number of point in 1D space + unsigned int + n_iteration; // The number of iteration to simulate the heat propagation + + // Read input parameters + try { + n_point = std::stoi(argv[1]); + n_iteration = std::stoi(argv[2]); + + } catch (...) { + Usage(argv[0]); + return (-1); + } + + std::cout << "Number of points: " << n_point << "\n"; + std::cout << "Number of iterations: " << n_iteration << "\n"; + + // Array heat and heat_next arrays store temperatures of the current and next + // iteration of n_point (calculated in kernel) + float* heat = new float[n_point + 2]; + float* heat_next = new float[n_point + 2]; + + // heat_CPU and heat_next_CPU store temperatures of the current and next + // iteration of n_point (calculated in CPU or comparison) + float* heat_CPU = new float[n_point + 2]; + float* heat_CPU_next = new float[n_point + 2]; + + // Constant used in the simulation + float C = (k * dt) / (dx * dx); + + // Heat initial condition at t = 0 + Initialize(heat, n_point + 2, 0); + Initialize(heat_next, n_point + 2, 0); + + // Start timer + dpc_common::TimeInterval t_par; + + float* final_device = + ComputeHeatDeviceParallel(heat, heat_next, C, n_point, n_iteration, temp); + + // Display time used by device + std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + + // Compute heat in CPU in (for comparision) + float* final_CPU = NULL; + + final_CPU = ComputeHeatHostSerial(heat_CPU, heat_CPU_next, C, n_point, + n_iteration, temp); + + // Compare the results computed in device (in parallel) and in host (in + // serial) + bool err = CompareResults(final_device, final_CPU, n_point, C); + + if (err == true) + std::cout << "Please check the error_diff.txt file ...\n"; + else + std::cout << "PASSED! 
There is no difference between the results computed " + "in host and in kernel.\n"; + + // Cleanup + delete[] heat; + delete[] heat_next; + delete[] heat_CPU; + delete[] heat_CPU_next; + + return 0; +} From 306e10bd00ab743945fd76c6dd0e6b4e5c54f815 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 10:15:31 -0700 Subject: [PATCH 04/23] Updating License file to remove date --- DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt index 6e9524bd74..415025cf03 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt @@ -1,4 +1,4 @@ -Copyright 2020 Intel Corporation +Copyright Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: From d8b1c57c7cb33a22a669a2a9a01d6f9853eb93fb Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 10:27:06 -0700 Subject: [PATCH 05/23] Adding Buffer Object approach. --- .../GraphTraversal/bitonic-sort/README.md | 9 +- .../bitonic-sort/src/bitonic-sort.cpp | 138 ++++++++++++++---- 2 files changed, 114 insertions(+), 33 deletions(-) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md index 061f753ed0..0777dc2c0b 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -51,7 +51,7 @@ if a compatible GPU is not detected. ## Key Implementation Details The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command g -roups. Unified Shared Memory (USM) is used for data management. +roups. Unified Shared Memory (USM) and Buffer Object are used for data management. ## License This code sample is licensed under MIT license @@ -117,7 +117,10 @@ the ascending order is verified, the application will display a “Success!” m $ ./bitonic-sort 21 47 Array size: 2097152, seed: 47 Device: Intel(R) Gen9 HD Graphics NEO -Kernel time: 0.416827 sec -CPU serial time: 0.60523 sec +Warm up ... +Kernel time using USM: 0.248422 sec +Kernel time using buffer allocation: 0.253364 sec +CPU serial time: 0.628803 sec + Success! ``` diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp index e0e4312520..07dad3e0f0 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -35,38 +35,90 @@ // data to the kernel. The kernel swaps the elements accordingly in parallel. // #include -#include #include +#include "dpc_common.hpp" using namespace sycl; using namespace std; -void ParallelBitonicSort(int a[], int n, queue &q) { +#define DEBUG 0 + +void ParallelBitonicSort(int data_gpu[], int n, queue &q) { + // n: the exponent used to set the array size. 
Array size = power(2, n) + int size = pow(2, n); + int* a = data_gpu; + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1, ..., 0 + for (int stage = step; stage >= 0; stage--) { + int seq_len = pow(2, stage + 1); + + // Constant used in the kernel: 2**(step-stage). + int two_power = 1 << (step - stage); + + // Offload the work to kernel. + q.submit([&](handler &h) { + h.parallel_for(range<1>(size), [=](id<1> i) { + // Assign the bitonic sequence number. + int seq_num = i / seq_len; + + // Variable used to identified the swapped element. + int swapped_ele = -1; + + // Because the elements in the first half in the bitonic + // sequence may swap with elements in the second half, + // only the first half of elements in each sequence is + // required (seq_len/2). + int h_len = seq_len / 2; + + if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len; + + // Check whether increasing or decreasing order. + int odd = seq_num / two_power; + + // Boolean variable used to determine "increasing" or + // "decreasing" order. + bool increasing = ((odd % 2) == 0); + + // Swap the elements in the bitonic sequence if needed + if (swapped_ele != -1) { + if (((a[i] > a[swapped_ele]) && increasing) || + ((a[i] < a[swapped_ele]) && !increasing)) { + int temp = a[i]; + a[i] = a[swapped_ele]; + a[swapped_ele] = temp; + } + } + }); + }); + q.wait(); + } // end stage + } // end step +} + +void ParallelBitonicSortBuffer(int data_gpu[], int n, queue &q) { // n: the exponent used to set the array size. Array size = power(2, n) int size = pow(2, n); + buffer input (data_gpu, size); + // step from 0, 1, 2, ...., n-1 for (int step = 0; step < n; step++) { // for each step s, stage goes s, s-1, ..., 0 for (int stage = step; stage >= 0; stage--) { - // In each state, construct a number (num_seq) of bitonic sequences of - // size seq_len (2, 4, ...) num_seq stores the number of bitonic sequences - // at each stage. seq_len stores the length of the bitonic sequence at - // each stage. int seq_len = pow(2, stage + 1); -#if DEBUG - int num_seq = pow(2, (n - stage - 1)); // Used for debug purpose. - std::cout << "step num:" << step << " stage num:" << stage - << " num_seq:" << num_seq << "(" << seq_len << ") => "; -#endif + // Constant used in the kernel: 2**(step-stage). int two_power = 1 << (step - stage); // Offload the work to kernel. q.submit([&](handler &h) { - h.parallel_for(range<1>(size), [=](id<1> i) { + auto a = input.get_access(h); + + h.parallel_for(range<1>(size), [=](id<1> i) { // Assign the bitonic sequence number. - int seq_num = i / seq_len; + int seq_num = i / seq_len; // Variable used to identified the swapped element. int swapped_ele = -1; @@ -190,40 +242,62 @@ int main(int argc, char *argv[]) { std::cout << "Device: " << q.get_device().get_info() << "\n"; + // Memory allocated for host access only. + int *data_cpu = (int *)malloc(size * sizeof(int)); + // USM allocation using malloc_shared: data stores a sequence of random // numbers. - int *data = malloc_shared(size, q); + int *data_usm = malloc_shared(size, q); - // Memory allocated for host access only. - int *data2 = (int *)malloc(size * sizeof(int)); + // Memory allocated to store gpu results using buffer allocation + int *data_gpu = (int *)malloc(size * sizeof(int)); // Initialize the array randomly using a seed. 
srand(seed); - for (int i = 0; i < size; i++) data[i] = data2[i] = rand() % 1000; + for (int i = 0; i < size; i++) + data_usm[i] = data_gpu[i] = data_cpu[i] = rand() % 1000; #if DEBUG std::cout << "\ndata before:\n"; - DisplayArray(data, size); + DisplayArray(data_usm, size); #endif + // Warm up + std::cout << "Warm up ...\n"; + ParallelBitonicSort(data_usm, n, q); + // Start timer dpc_common::TimeInterval t_par; - ParallelBitonicSort(data, n, q); + // Parallel sort using USM + ParallelBitonicSort(data_usm, n, q); - std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + std::cout << "Kernel time using USM: " << t_par.Elapsed() << " sec\n"; #if DEBUG - std::cout << "\ndata after sorting using parallel bitonic sort:\n"; - DisplayArray(data, size); + std::cout << "\ndata_usm after sorting using parallel bitonic sort:\n"; + DisplayArray(data_usm, size); #endif + // Start timer + dpc_common::TimeInterval t_par2; + + // Parallel sort using buffer allocation + ParallelBitonicSortBuffer(data_gpu, n, q); + + std::cout << "Kernel time using buffer allocation: " << t_par2.Elapsed() << " sec\n"; + +#if DEBUG + std::cout << "\ndata_gpu after sorting using parallel bitonic sort:\n"; + DisplayArray(data_gpu, size); +#endif + // Start timer dpc_common::TimeInterval t_ser; // Bitonic sort in CPU (serial) - BitonicSort(data2, n); + BitonicSort(data_cpu, n); std::cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n"; @@ -231,18 +305,22 @@ int main(int argc, char *argv[]) { bool pass = true; for (int i = 0; i < size - 1; i++) { // Validate the sequence order is increasing in both kernel and CPU. - if ((data[i] > data[i + 1]) || (data[i] != data2[i])) { + if ((data_usm[i] > data_usm[i + 1]) || (data_usm[i] != data_cpu[i])) { pass = false; break; } + + if ((data_gpu[i] > data_gpu[i + 1]) || (data_gpu[i] != data_cpu[i])) { + pass = false; + break; + } } - // Clean USM resources. - free(data, q); - - // Clean CPU memory. - free(data2); - + // Clean resources. + free(data_cpu); + free(data_usm, q); + free(data_gpu); + if (!pass) { std::cout << "\nFailed!\n"; return -2; From 74e5ec0d90cca66e0ba5228d90f6c5a63fb8cb16 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 13:34:27 -0700 Subject: [PATCH 06/23] Add comment about the location of dpc_common.hpp. --- .../DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp index 07dad3e0f0..0153bf4cd1 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -36,6 +36,9 @@ // #include #include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" using namespace sycl; From e2e8d014d73ef99684c717bdb0ee55347715aea6 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 14:04:43 -0700 Subject: [PATCH 07/23] New sample: Prefix Sum. 
--- .../ParallelPatterns/PrefixSum/CMakeLists.txt | 30 +++ .../ParallelPatterns/PrefixSum/License.txt | 7 + .../ParallelPatterns/PrefixSum/PrefixSum.sln | 25 ++ .../PrefixSum/PrefixSum.vcxproj | 137 ++++++++++ .../PrefixSum/PrefixSum.vcxproj.filters | 22 ++ .../PrefixSum/PrefixSum.vcxproj.user | 11 + .../ParallelPatterns/PrefixSum/README.md | 124 +++++++++ .../ParallelPatterns/PrefixSum/sample.json | 29 +++ .../PrefixSum/src/PrefixSum.cpp | 239 ++++++++++++++++++ 9 files changed, 624 insertions(+) create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt new file mode 100644 index 0000000000..85fcec4963 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (PrefixSum) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (PrefixSum src/PrefixSum.cpp) + +add_custom_target (run + COMMAND PrefixSum 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt new file mode 100644 index 0000000000..415025cf03 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln new file mode 100644 index 0000000000..3587a92e74 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29926.136 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj new file mode 100644 index 0000000000..6a6309b96b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} + Win32Proj + PrefixSum + 10.0.17763.0 + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters new file mode 100644 index 0000000000..2003dce0f2 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git 
a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user new file mode 100644 index 0000000000..7288fa06dd --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md new file mode 100644 index 0000000000..6bbc2cfdfb --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md @@ -0,0 +1,124 @@ +# `Prefix Sum` sample + +This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns +a new sequence y0, y1, y2, ..., yn so that + +y0 = x0 +y1 = x0 + x1 +y2 = x0 + x1 + x2 +..... +yn = x0 + x1 + x2 + ... + xn + +Below is the pseudo code for computing prefix sum in parallel: + +n is power of 2 (1, 2, 4 , 8, 16, ...): + +for i from 0 to [log2 n] - 1 do + for j from 0 to (n-1) do in parallel + if j<2^i then + x_{j}^{i+1} <- x_{j}^{i}} + else + x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} + +In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +Given n processors to perform each iteration of the inner loop in constant time, the algorithm +as a whole runs in O(log n) time, the number of iterations of the outer loop. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a +compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command +groups. + +## License +This code sample is licensed under MIT license + +## Building the `PrefixSum` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd PrefixSum + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. 
Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: PrefixSum + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the verification the results in the CPU. +The results are verified if yk = yk-1 + xk the original compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./PrefixSum 21 47 + +Sequence size: 2097152, seed: 47 +Num iteration: 21 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 170 ms + +Success! +``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json new file mode 100644 index 0000000000..def268a2f8 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", + "name": "PrefixSum", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "PrefixSum.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp new file mode 100644 index 0000000000..b2af8367a7 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp @@ -0,0 +1,239 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That +// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and +// returns a new sequence y0, y1, y2, ..., yn so that +// +// y0 = x0 +// y1 = x0 + x1 +// y2 = x0 + x1 + x2 +// ..... +// yn = x0 + x1 + x2 + ... + xn +// +// Below is the pseudo code for computing prefix sum in parallel: +// +// n is power of 2 (1, 2, 4 , 8, 16, ...): +// +// for i from 0 to [log2 n] - 1 do +// for j from 0 to (n-1) do in parallel +// if j<2^i then +// x_{j}^{i+1} <- x_{j}^{i}} +// else +// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} +// +// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. 
+// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as +// a whole runs in O(log n) time, the number of iterations of the outer loop. +// + +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +void Show(int a[], int arraysize) +{ + for (int i = 0; i < arraysize; ++i) + { + std::cout << a[i] << " "; + if ((i % 16) == 15) std::cout << "\n"; + } + + std::cout << "\n"; + return; +} + +int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + //unsigned int uintmax = UINT_MAX; + int* result = NULL; + + // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; + // Buffer scope + { + buffer prefix1_buf(prefix1, range<1>{nb}); + buffer prefix2_buf(prefix2, range<1>{nb}); + + // Iterate over the necessary iterations. + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + + // Submit command group for execution + q.submit([&](handler& h) { + // Create accessors + auto prefix1_acc = prefix1_buf.get_access(h); + auto prefix2_acc = prefix2_buf.get_access(h); + + if (iter % 2 == 0) { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix2_acc[j] = prefix1_acc[j]; + } + else { + prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix2; + //std::cout << "return prefix2\n"; + } + else { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix1_acc[j] = prefix2_acc[j]; + } + else { + prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix1; + //std::cout << "return prefix1\n"; + } + }); // end device queue + } // end iteration + } // Buffer scope + + // Wait for commands to complete. Enforce synchronization on the command queue + q.wait_and_throw(); + + return result; +} +/* +void PrefixSum(int* x, unsigned int nb) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + int temp = 0; + + // Iterate over the necessary iterations + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + //Show(x, nb); + // std::cout << "two_power: " << two_power << "\n"; + for (unsigned int j = nb; j > 0; j--) { + if (j < two_power) { + x[j] = x[j]; + } + else { + x[j] = x[j] + x[j - two_power]; + } + } + } +} +*/ +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ +ement in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ +sponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char* argv[]) { + unsigned int nb, seed; + int n, exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + nb = pow(2, n); + } catch (...) 
{ + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nSequence size: " << nb << ", seed: " << seed; + + int num_iter = log2(nb); + std::cout << "\nNum iteration: " << num_iter << "\n"; + + // Define device selector as 'default' + default_selector device_selector; + + // exception handler + auto exception_handler = [](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::terminate(); + } + } + }; + + // Create a device queue using DPC++ class queue + queue q(device_selector, exception_handler); + + std::cout << "Device: " << q.get_device().get_info() << "\n"; + + int *data = new int[nb]; + int *prefix_sum1 = new int[nb]; + int *prefix_sum2 = new int[nb]; + int *result = NULL; + + srand(seed); + + // Initialize data arrays + for (int i = 0; i < nb; i++) { + data[i] = prefix_sum1[i] = rand() % 10; + prefix_sum2[i] = 0; + } + + // Start timer + auto start = std::chrono::steady_clock::now(); + + result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); + + auto end = std::chrono::steady_clock::now(); + auto timeKern = std::chrono::duration_cast(end - start).count(); + std::cout << "Kernel time: " << timeKern << " ms" << "\n"; + + //std::cout << "\ndata after transforming using parallel prefix sum result:"; + //Show(result, nb); + + bool equal = true; + + if (result[0] != data[0]) + equal = false; + else + { + for (int i = 1; i < nb; i++) { + if (result[i] != result[i - 1] + data[i]) + { + equal = false; + break; + } + } + } + + delete[] data; + delete[] prefix_sum1; + delete[] prefix_sum2; + + if (!equal) { + std::cout << "\nFailed: " << std::endl; + return -2; + } + else { + std::cout << "\nSuccess!" << std::endl; + return 0; + } +} From 4f4a320e291d605fab1126efa3a03bcf1593374e Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 14:16:22 -0700 Subject: [PATCH 08/23] Remove new sample. 
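For reference alongside the PrefixSum sample added above: the kernel in ParallelPrefixSum() implements a Hillis-Steele inclusive scan that ping-pongs between two arrays, and the same scheme can be written serially, which is convenient for cross-checking the device result on small inputs. A minimal host-only sketch (not part of the sample; like the sample, it assumes the array length is a power of two):

```
// Serial inclusive scan using the same double-buffered scheme as ParallelPrefixSum.
// Sketch only; nb must be a power of two.
#include <cmath>
#include <utility>
#include <vector>

std::vector<int> SerialPrefixSum(const std::vector<int> &x) {
  unsigned int nb = x.size();
  unsigned int num_iter = static_cast<unsigned int>(std::log2(nb));
  std::vector<int> a(x), b(nb);

  unsigned int two_power = 1;
  for (unsigned int iter = 0; iter < num_iter; iter++, two_power *= 2) {
    for (unsigned int j = 0; j < nb; j++) {
      // Elements with index below 2^iter are already final for this step.
      b[j] = (j < two_power) ? a[j] : a[j] + a[j - two_power];
    }
    std::swap(a, b);  // a always holds the result of the last completed step
  }
  return a;  // a[k] = x[0] + x[1] + ... + x[k]
}
```

For example, the input 3 1 4 1 5 9 2 6 scans to 3 4 8 9 14 23 25 31, which is exactly the property the verification loop in the sample's main() checks one element at a time (result[i] == result[i-1] + data[i]).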
--- .../ParallelPatterns/PrefixSum/CMakeLists.txt | 30 --- .../ParallelPatterns/PrefixSum/License.txt | 7 - .../ParallelPatterns/PrefixSum/PrefixSum.sln | 25 -- .../PrefixSum/PrefixSum.vcxproj | 137 ---------- .../PrefixSum/PrefixSum.vcxproj.filters | 22 -- .../PrefixSum/PrefixSum.vcxproj.user | 11 - .../ParallelPatterns/PrefixSum/README.md | 124 --------- .../ParallelPatterns/PrefixSum/sample.json | 29 --- .../PrefixSum/src/PrefixSum.cpp | 239 ------------------ 9 files changed, 624 deletions(-) delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt deleted file mode 100644 index 85fcec4963..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -# required cmake version -cmake_minimum_required(VERSION 3.5) - -project (PrefixSum) - -if(WIN32) - set(CMAKE_CXX_COMPILER "dpcpp") -else() - set(CMAKE_CXX_COMPILER "dpcpp") -endif() - -# Set default build type to RelWithDebInfo if not specified -if (NOT CMAKE_BUILD_TYPE) - message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") - set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE - STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" - FORCE) -endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") - -set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") - -add_executable (PrefixSum src/PrefixSum.cpp) - -add_custom_target (run - COMMAND PrefixSum 21 47 - WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} -) - diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt deleted file mode 100644 index 415025cf03..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt +++ /dev/null @@ -1,7 +0,0 @@ -Copyright Intel Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln deleted file mode 100644 index 3587a92e74..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.29926.136 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} - EndGlobalSection -EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj deleted file mode 100644 index 6a6309b96b..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj +++ /dev/null @@ -1,137 +0,0 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - 15.0 - {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} - Win32Proj - PrefixSum - 10.0.17763.0 - - - - Application - true - Intel(R) oneAPI DPC++ Compiler - Unicode - - - Application - false - Intel(R) oneAPI DPC++ Compiler - true - Unicode - - - Application - true - Intel(R) oneAPI DPC++ Compiler - Unicode - - - Application - false - Intel(R) oneAPI DPC++ Compiler - true - Unicode - - - - - - - - - - - - - - - - - - - - - true - - - true - - - false - - - false - - - - - - - - - - Console - true - - - - - - - - - %ONEAPI_ROOT%\dev-utilities\latest\include - - - Console - true - - - - - - - - - - - Console - true - true - true - - - - - - - - - %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) - - - Console - true - true - true - - - - - - \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters deleted file mode 100644 index 2003dce0f2..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters +++ /dev/null @@ -1,22 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hh;hpp;hxx;hm;inl;inc;ipp;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Source Files - - - \ No newline at end of file diff --git 
a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user deleted file mode 100644 index 7288fa06dd..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user +++ /dev/null @@ -1,11 +0,0 @@ - - - - 21 47 - WindowsLocalDebugger - - - 21 47 - WindowsLocalDebugger - - \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md deleted file mode 100644 index 6bbc2cfdfb..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md +++ /dev/null @@ -1,124 +0,0 @@ -# `Prefix Sum` sample - -This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to -offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given -(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is -in ascending order. - -For comprehensive instructions regarding DPC++ Programming, go to -https://software.intel.com/en-us/oneapi-programming-guide -and search based on relevant terms noted in the comments. - -| Optimized for | Description -|:--- |:--- -| OS | Linux Ubuntu 18.04 -| Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) -| What you will learn | Implement bitonic sort using Intel DPC++ compiler -| Time to complete | 15 minutes - - -## Purpose - -Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns -a new sequence y0, y1, y2, ..., yn so that - -y0 = x0 -y1 = x0 + x1 -y2 = x0 + x1 + x2 -..... -yn = x0 + x1 + x2 + ... + xn - -Below is the pseudo code for computing prefix sum in parallel: - -n is power of 2 (1, 2, 4 , 8, 16, ...): - -for i from 0 to [log2 n] - 1 do - for j from 0 to (n-1) do in parallel - if j<2^i then - x_{j}^{i+1} <- x_{j}^{i}} - else - x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} - -In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. -Given n processors to perform each iteration of the inner loop in constant time, the algorithm -as a whole runs in O(log n) time, the number of iterations of the outer loop. - -The code will attempt first to execute on an available GPU and fallback to the system's CPU if a -compatible GPU is not detected. - -## Key Implementation Details - -The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command -groups. - -## License -This code sample is licensed under MIT license - -## Building the `PrefixSum` Program for CPU and GPU - -### Include Files -The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. - -### Running Samples In DevCloud -If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, -FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI -Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) - -### On a Linux* System -1. Build the program using the following `cmake` commands. - ``` - $ cd PrefixSum - $ mkdir build - $ cd build - $ cmake .. - $ make - ``` - -2. Run the program: - ``` - make run - ``` - -3. 
Clean the program using: - ``` - make clean - ``` - -### On a Windows* System - * Build the program using VS2017 or VS2019 - Right click on the solution file and open using either VS2017 or VS2019 IDE. - Right click on the project in Solution explorer and select Rebuild. - From top menu select Debug -> Start without Debugging. - - * Build the program using MSBuild - Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for - VS2019" - Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" - -## Running the sample -### Application Parameters - - Usage: PrefixSum - -where - -exponent is a positive number. The according length of the sequence is 2**exponent. - -seed is the seed used by the random generator to generate the randomness. - -The sample offloads the computation to GPU and then performs the verification the results in the CPU. -The results are verified if yk = yk-1 + xk the original compared. If the results are matched and -the ascending order is verified, the application will display a “Success!” message. - -### Example of Output -``` -$ ./PrefixSum 21 47 - -Sequence size: 2097152, seed: 47 -Num iteration: 21 -Device: Intel(R) Gen9 HD Graphics NEO -Kernel time: 170 ms - -Success! -``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json deleted file mode 100644 index def268a2f8..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", - "name": "PrefixSum", - "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], - "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", - "toolchain": [ "dpcpp" ], - "targetDevice": [ "CPU", "GPU" ], - "languages": [ { "cpp": {} } ], - "os": [ "linux", "windows" ], - "builder": [ "ide", "cmake" ], - "ciTests": { - "linux": [{ - "steps": [ - "mkdir build", - "cd build", - "cmake ..", - "make", - "make run" - ] - }], - "windows": [{ - "steps": [ - "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", - "cd x64/Release", - "PrefixSum.exe 21 47" - ] - }] - } -} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp deleted file mode 100644 index b2af8367a7..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp +++ /dev/null @@ -1,239 +0,0 @@ -//============================================================== -// Copyright © 2020 Intel Corporation -// -// SPDX-License-Identifier: MIT -// ============================================================= -// -// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That -// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and -// returns a new sequence y0, y1, y2, ..., yn so that -// -// y0 = x0 -// y1 = x0 + x1 -// y2 = x0 + x1 + x2 -// ..... -// yn = x0 + x1 + x2 + ... + xn -// -// Below is the pseudo code for computing prefix sum in parallel: -// -// n is power of 2 (1, 2, 4 , 8, 16, ...): -// -// for i from 0 to [log2 n] - 1 do -// for j from 0 to (n-1) do in parallel -// if j<2^i then -// x_{j}^{i+1} <- x_{j}^{i}} -// else -// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} -// -// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. 
-// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as -// a whole runs in O(log n) time, the number of iterations of the outer loop. -// - -#include - -// dpc_common.hpp can be found in the dev-utilities include folder. -// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp -#include "dpc_common.hpp" - -using namespace sycl; -using namespace std; - -void Show(int a[], int arraysize) -{ - for (int i = 0; i < arraysize; ++i) - { - std::cout << a[i] << " "; - if ((i % 16) == 15) std::cout << "\n"; - } - - std::cout << "\n"; - return; -} - -int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) -{ - unsigned int two_power = 1; - unsigned int num_iter = log2(nb); - //unsigned int uintmax = UINT_MAX; - int* result = NULL; - - // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; - // Buffer scope - { - buffer prefix1_buf(prefix1, range<1>{nb}); - buffer prefix2_buf(prefix2, range<1>{nb}); - - // Iterate over the necessary iterations. - for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { - - // Submit command group for execution - q.submit([&](handler& h) { - // Create accessors - auto prefix1_acc = prefix1_buf.get_access(h); - auto prefix2_acc = prefix2_buf.get_access(h); - - if (iter % 2 == 0) { - h.parallel_for(range<1>(nb), [=](id<1> j) { - if (j < two_power) { - prefix2_acc[j] = prefix1_acc[j]; - } - else { - prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; - } - }); // end parallel for loop in kernel - result = prefix2; - //std::cout << "return prefix2\n"; - } - else { - h.parallel_for(range<1>(nb), [=](id<1> j) { - if (j < two_power) { - prefix1_acc[j] = prefix2_acc[j]; - } - else { - prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; - } - }); // end parallel for loop in kernel - result = prefix1; - //std::cout << "return prefix1\n"; - } - }); // end device queue - } // end iteration - } // Buffer scope - - // Wait for commands to complete. Enforce synchronization on the command queue - q.wait_and_throw(); - - return result; -} -/* -void PrefixSum(int* x, unsigned int nb) -{ - unsigned int two_power = 1; - unsigned int num_iter = log2(nb); - int temp = 0; - - // Iterate over the necessary iterations - for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { - //Show(x, nb); - // std::cout << "two_power: " << two_power << "\n"; - for (unsigned int j = nb; j > 0; j--) { - if (j < two_power) { - x[j] = x[j]; - } - else { - x[j] = x[j] + x[j - two_power]; - } - } - } -} -*/ -void Usage(std::string prog_name, int exponent) { - std::cout << " Incorrect parameters\n"; - std::cout << " Usage: " << prog_name << " n k \n\n"; - std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ -ement in\n"; - std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ -sponding\n"; - std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; - std::cout << " k: Seed used to generate a random sequence.\n"; -} - -int main(int argc, char* argv[]) { - unsigned int nb, seed; - int n, exp_max = log2(std::numeric_limits::max()); - - // Read parameters. - try { - n = std::stoi(argv[1]); - - // Verify the boundary of acceptance. - if (n < 0 || n >= exp_max) { - Usage(argv[0], exp_max); - return -1; - } - - seed = std::stoi(argv[2]); - nb = pow(2, n); - } catch (...) 
{ - Usage(argv[0], exp_max); - return -1; - } - - std::cout << "\nSequence size: " << nb << ", seed: " << seed; - - int num_iter = log2(nb); - std::cout << "\nNum iteration: " << num_iter << "\n"; - - // Define device selector as 'default' - default_selector device_selector; - - // exception handler - auto exception_handler = [](exception_list exceptionList) { - for (std::exception_ptr const& e : exceptionList) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception const& e) { - std::terminate(); - } - } - }; - - // Create a device queue using DPC++ class queue - queue q(device_selector, exception_handler); - - std::cout << "Device: " << q.get_device().get_info() << "\n"; - - int *data = new int[nb]; - int *prefix_sum1 = new int[nb]; - int *prefix_sum2 = new int[nb]; - int *result = NULL; - - srand(seed); - - // Initialize data arrays - for (int i = 0; i < nb; i++) { - data[i] = prefix_sum1[i] = rand() % 10; - prefix_sum2[i] = 0; - } - - // Start timer - auto start = std::chrono::steady_clock::now(); - - result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); - - auto end = std::chrono::steady_clock::now(); - auto timeKern = std::chrono::duration_cast(end - start).count(); - std::cout << "Kernel time: " << timeKern << " ms" << "\n"; - - //std::cout << "\ndata after transforming using parallel prefix sum result:"; - //Show(result, nb); - - bool equal = true; - - if (result[0] != data[0]) - equal = false; - else - { - for (int i = 1; i < nb; i++) { - if (result[i] != result[i - 1] + data[i]) - { - equal = false; - break; - } - } - } - - delete[] data; - delete[] prefix_sum1; - delete[] prefix_sum2; - - if (!equal) { - std::cout << "\nFailed: " << std::endl; - return -2; - } - else { - std::cout << "\nSuccess!" << std::endl; - return 0; - } -} From 9de715d9102c6610259d8612feaa35d3943a172f Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Tue, 18 Aug 2020 15:13:37 -0700 Subject: [PATCH 09/23] New code sample PrefixSum in ParallelPatterns. 
Signed-off-by: Loc Nguyen --- .../ParallelPatterns/PrefixSum/CMakeLists.txt | 30 +++ .../ParallelPatterns/PrefixSum/License.txt | 7 + .../ParallelPatterns/PrefixSum/PrefixSum.sln | 25 ++ .../PrefixSum/PrefixSum.vcxproj | 137 ++++++++++ .../PrefixSum/PrefixSum.vcxproj.filters | 22 ++ .../PrefixSum/PrefixSum.vcxproj.user | 11 + .../ParallelPatterns/PrefixSum/README.md | 124 +++++++++ .../ParallelPatterns/PrefixSum/sample.json | 29 +++ .../PrefixSum/src/PrefixSum.cpp | 239 ++++++++++++++++++ 9 files changed, 624 insertions(+) create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt new file mode 100644 index 0000000000..85fcec4963 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (PrefixSum) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (PrefixSum src/PrefixSum.cpp) + +add_custom_target (run + COMMAND PrefixSum 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt new file mode 100644 index 0000000000..415025cf03 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln new file mode 100644 index 0000000000..3587a92e74 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29926.136 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj new file mode 100644 index 0000000000..6a6309b96b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} + Win32Proj + PrefixSum + 10.0.17763.0 + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters new file mode 100644 index 0000000000..2003dce0f2 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git 
a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user new file mode 100644 index 0000000000..7288fa06dd --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md new file mode 100644 index 0000000000..6bbc2cfdfb --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md @@ -0,0 +1,124 @@ +# `Prefix Sum` sample + +This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns +a new sequence y0, y1, y2, ..., yn so that + +y0 = x0 +y1 = x0 + x1 +y2 = x0 + x1 + x2 +..... +yn = x0 + x1 + x2 + ... + xn + +Below is the pseudo code for computing prefix sum in parallel: + +n is power of 2 (1, 2, 4 , 8, 16, ...): + +for i from 0 to [log2 n] - 1 do + for j from 0 to (n-1) do in parallel + if j<2^i then + x_{j}^{i+1} <- x_{j}^{i}} + else + x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} + +In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +Given n processors to perform each iteration of the inner loop in constant time, the algorithm +as a whole runs in O(log n) time, the number of iterations of the outer loop. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a +compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command +groups. + +## License +This code sample is licensed under MIT license + +## Building the `PrefixSum` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd PrefixSum + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. 
Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: PrefixSum + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the verification the results in the CPU. +The results are verified if yk = yk-1 + xk the original compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./PrefixSum 21 47 + +Sequence size: 2097152, seed: 47 +Num iteration: 21 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 170 ms + +Success! +``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json new file mode 100644 index 0000000000..def268a2f8 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", + "name": "PrefixSum", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "PrefixSum.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp new file mode 100644 index 0000000000..b2af8367a7 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp @@ -0,0 +1,239 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That +// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and +// returns a new sequence y0, y1, y2, ..., yn so that +// +// y0 = x0 +// y1 = x0 + x1 +// y2 = x0 + x1 + x2 +// ..... +// yn = x0 + x1 + x2 + ... + xn +// +// Below is the pseudo code for computing prefix sum in parallel: +// +// n is power of 2 (1, 2, 4 , 8, 16, ...): +// +// for i from 0 to [log2 n] - 1 do +// for j from 0 to (n-1) do in parallel +// if j<2^i then +// x_{j}^{i+1} <- x_{j}^{i}} +// else +// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} +// +// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. 
+// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as +// a whole runs in O(log n) time, the number of iterations of the outer loop. +// + +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +void Show(int a[], int arraysize) +{ + for (int i = 0; i < arraysize; ++i) + { + std::cout << a[i] << " "; + if ((i % 16) == 15) std::cout << "\n"; + } + + std::cout << "\n"; + return; +} + +int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + //unsigned int uintmax = UINT_MAX; + int* result = NULL; + + // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; + // Buffer scope + { + buffer prefix1_buf(prefix1, range<1>{nb}); + buffer prefix2_buf(prefix2, range<1>{nb}); + + // Iterate over the necessary iterations. + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + + // Submit command group for execution + q.submit([&](handler& h) { + // Create accessors + auto prefix1_acc = prefix1_buf.get_access(h); + auto prefix2_acc = prefix2_buf.get_access(h); + + if (iter % 2 == 0) { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix2_acc[j] = prefix1_acc[j]; + } + else { + prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix2; + //std::cout << "return prefix2\n"; + } + else { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix1_acc[j] = prefix2_acc[j]; + } + else { + prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix1; + //std::cout << "return prefix1\n"; + } + }); // end device queue + } // end iteration + } // Buffer scope + + // Wait for commands to complete. Enforce synchronization on the command queue + q.wait_and_throw(); + + return result; +} +/* +void PrefixSum(int* x, unsigned int nb) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + int temp = 0; + + // Iterate over the necessary iterations + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + //Show(x, nb); + // std::cout << "two_power: " << two_power << "\n"; + for (unsigned int j = nb; j > 0; j--) { + if (j < two_power) { + x[j] = x[j]; + } + else { + x[j] = x[j] + x[j - two_power]; + } + } + } +} +*/ +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ +ement in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ +sponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char* argv[]) { + unsigned int nb, seed; + int n, exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + nb = pow(2, n); + } catch (...) 
{ + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nSequence size: " << nb << ", seed: " << seed; + + int num_iter = log2(nb); + std::cout << "\nNum iteration: " << num_iter << "\n"; + + // Define device selector as 'default' + default_selector device_selector; + + // exception handler + auto exception_handler = [](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::terminate(); + } + } + }; + + // Create a device queue using DPC++ class queue + queue q(device_selector, exception_handler); + + std::cout << "Device: " << q.get_device().get_info() << "\n"; + + int *data = new int[nb]; + int *prefix_sum1 = new int[nb]; + int *prefix_sum2 = new int[nb]; + int *result = NULL; + + srand(seed); + + // Initialize data arrays + for (int i = 0; i < nb; i++) { + data[i] = prefix_sum1[i] = rand() % 10; + prefix_sum2[i] = 0; + } + + // Start timer + auto start = std::chrono::steady_clock::now(); + + result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); + + auto end = std::chrono::steady_clock::now(); + auto timeKern = std::chrono::duration_cast(end - start).count(); + std::cout << "Kernel time: " << timeKern << " ms" << "\n"; + + //std::cout << "\ndata after transforming using parallel prefix sum result:"; + //Show(result, nb); + + bool equal = true; + + if (result[0] != data[0]) + equal = false; + else + { + for (int i = 1; i < nb; i++) { + if (result[i] != result[i - 1] + data[i]) + { + equal = false; + break; + } + } + } + + delete[] data; + delete[] prefix_sum1; + delete[] prefix_sum2; + + if (!equal) { + std::cout << "\nFailed: " << std::endl; + return -2; + } + else { + std::cout << "\nSuccess!" << std::endl; + return 0; + } +} From e2e39f1e2c6cb4865c043e3b74896357af1bbdeb Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Thu, 20 Aug 2020 22:38:50 -0700 Subject: [PATCH 10/23] Integrate MPI code sample with dpc_reduce code sample. 
--- .../dpc_reduce/CMakeLists.txt | 36 ++- .../ParallelPatterns/dpc_reduce/README.md | 44 ++- .../ParallelPatterns/dpc_reduce/sample.json | 2 +- .../dpc_reduce/src/CMakeLists.txt | 24 -- .../ParallelPatterns/dpc_reduce/src/main.cpp | 306 +++++++++++++----- 5 files changed, 289 insertions(+), 123 deletions(-) delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt index f472928505..2be7bcff6a 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt @@ -1,12 +1,24 @@ -set(CMAKE_CXX_COMPILER "dpcpp") -# Set default build type to RelWithDebInfo if not specified -if (NOT CMAKE_BUILD_TYPE) - message (STATUS "Default CMAKE_BUILD_TYPE not set using Release") - set (CMAKE_BUILD_TYPE "Release" CACHE - STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" - FORCE) -endif() - -cmake_minimum_required (VERSION 3.0) -project(dpc_reduce LANGUAGES CXX) -add_subdirectory (src) +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (dpc_reduce) +set(CMAKE_CXX_COMPILER "mpiicpc") + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ltbb -lsycl") + +add_executable (dpc_reduce src/main.cpp) + +add_custom_target (run + COMMAND dpc_reduce + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md index 7a08d01177..6203feaa09 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md @@ -1,6 +1,7 @@ # dpc_reduce Sample -The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. +The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. This code sample also demonstrates how to incorporate Data Parallel PC++ into +a MPI program. For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. @@ -29,6 +30,12 @@ tiny rectangles and then summing up the results. The parallel computations are performed using oneTBB and oneAPI DPC++ library (oneDPL). +This example also demonstrates how to incorporate Data Parallel PC++ into a MPI program. +Using Data Parallel C++, the code sample runs multiple MPI ranks to distribute the +calculation of the number Pi. Each rank offloads the computation to an accelerator +(GPU/CPU) using Intel DPC++ compiler to compute a partial compution of the number Pi. + + ## Key Implementation Details The basic DPC++ implementation explained in the code includes accessor, kernels, queues, buffers as well as some oneDPL library calls. 
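To make the distribution described above concrete: with num_procs ranks and total_num_steps slices in total, rank r integrates the sub-interval [r/num_procs, (r+1)/num_procs) of [0, 1], i.e. total_num_steps/num_procs slices whose midpoints are r/num_procs + k*dx + dx/2. A small standalone illustration of that decomposition, using the same names and midpoint formula as the mpi_native routine added later in this patch (an illustration only, not part of the sample):

```
// Illustration only: which slice midpoints a given MPI rank integrates.
#include <cstdio>

int main() {
  const long total_num_steps = 8;  // tiny value so the output is easy to read
  const int num_procs = 2;         // as if launched with: mpirun -n 2 ./dpc_reduce
  const float dx = 1.0f / total_num_steps;
  const float dx2 = dx / 2.0f;

  for (int rank = 0; rank < num_procs; rank++) {
    const long num_step_per_rank = total_num_steps / num_procs;
    for (long k = 0; k < num_step_per_rank; k++) {
      // Same midpoint formula as mpi_native: this rank starts at rank/num_procs.
      float x = (float)rank / (float)num_procs + (float)k * dx + dx2;
      std::printf("rank %d, slice %ld: x = %.4f, contribution = %f\n",
                  rank, k, x, (4.0f * dx) / (1.0f + x * x));
    }
  }
  return 0;
}
```

Summing the contributions over all ranks and slices approximates the integral of 4/(1+x^2) from 0 to 1, which equals pi.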
@@ -46,14 +53,15 @@ If running a sample in the Intel DevCloud, remember that you must specify the co ### On a Linux* System Perform the following steps: -1. Build the program using the following 'cmake' commands +1. Build the program using the following 'cmake' commands +export I_MPI_CXX=dpcpp mkdir build cd build cmake .. make 2. Run the program using: -make run or src/dpc_reduce +make run or './dpc_reduce' or 'mpirun ./dpc_reduce' 3. Clean the program using: make clean @@ -61,16 +69,26 @@ make clean ## Running the Sample ### Application Parameters -There are no editable parameters for this sample. + + Usage: mpirun -n ./dpc_reduce + +where + + : number of MPI rank. + ### Example of Output +Rank #0 runs on: lqnguyen-NUC1, uses device: Intel(R) Gen9 HD Graphics NEO Number of steps is 1000000 -Cpu Seq calc: PI =3.14 in 0.00348 seconds -Cpu TBB calc: PI =3.14 in 0.00178 seconds -dpstd native: PI =3.14 in 0.191 seconds -dpstd native2: PI =3.14 in 0.142 seconds -dpstd native3: PI =3.14 in 0.002 seconds -dpstd native4: PI =3.14 in 0.00234 seconds -dpstd two steps: PI =3.14 in 0.00138 seconds -dpstd transform_reduce: PI =3.14 in 0.000442 seconds -success +Cpu Seq calc: PI =3.14 in 0.00422 seconds +Cpu TBB calc: PI =3.14 in 0.00177 seconds +dpstd native: PI =3.14 in 0.209 seconds +dpstd native2: PI =3.14 in 0.213 seconds +dpstd native3: PI =3.14 in 0.00222 seconds +dpstd native4: PI =3.14 in 0.00237 seconds +dpstd two steps: PI =3.14 in 0.0014 seconds +dpstd transform_reduce: PI =3.14 in 0.000528 seconds +mpi native: PI =3.14 in 0.548 seconds +mpi transform_reduce: PI =3.14 in 0.000498 seconds +succes + diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json index b8c2f8cb72..cbce8a1dfc 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json @@ -19,7 +19,7 @@ "cd build", "cmake ..", "make", - "./src/dpc_reduce" + "./dpc_reduce" ] } ] diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt deleted file mode 100644 index cc3703162b..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -if (NOT CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 14) -endif() - -if (NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE RelWithDebInfo) -endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ltbb") - -# Add an executable target from source files -add_executable(${PROJECT_NAME} main.cpp) - -if(WIN32) - # Specify libraries to link with - target_link_libraries(${PROJECT_NAME} sycl ) - - # Add custom target for running - add_custom_target(run ${PROJECT_NAME}.exe) -else() - # Add custom target for running - add_custom_target(run ./${PROJECT_NAME}) -endif() diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp index 25cf767a49..dc81bc78e0 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp @@ -3,19 +3,23 @@ // // SPDX-License-Identifier: MIT // ============================================================= +#include + #include #include // setprecision library #include #include #include #include + #include "dpc_common.hpp" -// Many oneAPI code 
samples share common include files. These -// include files are installed locally with the product installation -// and can be located at %ONEAPI_ROOT%\dev-utilities\latest\include +// Many oneAPI code samples share common include files. These +// include files are installed locally with the product installation +// and can be located at %ONEAPI_ROOT%\dev-utilities\latest\include // on your development system. using namespace sycl; +constexpr int master = 0; // cpu_seq is a simple sequential CPU routine // that calculates all the slices and then @@ -437,83 +441,239 @@ float calc_pi_dpstd_onestep(int num_steps, Policy& policy) { return total; } +//////////////////////////////////////////////////////////////////////// +// +// Each MPI ranks compute the number Pi partially on target device using DPC++. +// The partial result of number Pi is returned in "results". +// +//////////////////////////////////////////////////////////////////////// +void mpi_native(float* results, int rank_num, int num_procs, + long total_num_steps, queue& q) { + int num_step_per_rank = total_num_steps / num_procs; + float dx, dx2; + + dx = 1.0f / (float)total_num_steps; + dx2 = dx / 2.0f; + + default_selector device_selector; + + // exception handler + // + // The exception_list parameter is an iterable list of std::exception_ptr + // objects. But those pointers are not always directly readable. So, we + // rethrow the pointer, catch it, and then we have the exception itself. + // Note: depending upon the operation there may be several exceptions. + auto exception_handler = [&](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::cout << "Failure" + << "\n"; + std::terminate(); + } + } + }; + + try { + // The size of amount of memory that will be given to the buffer. + range<1> num_items{total_num_steps / size_t(num_procs)}; + + // Buffers are used to tell SYCL which data will be shared between the host + // and the devices. + buffer results_buf(results, + range<1>(total_num_steps / size_t(num_procs))); + + // Submit takes in a lambda that is passed in a command group handler + // constructed at runtime. + q.submit([&](handler& h) { + // Accessors are used to get access to the memory owned by the buffers. + auto results_accessor = results_buf.get_access(h); + // Each kernel calculates a partial of the number Pi in parallel. + h.parallel_for(num_items, [=](id<1> k) { + float x = ((float)rank_num / (float)num_procs) + (float)k * dx + dx2; + results_accessor[k] = (4.0f * dx) / (1.0f + x * x); + }); + }); + } catch (...) { + std::cout << "Failure" << std::endl; + } +} + +// This function uses the DPC++ library call transform reduce. +// It does everything in one library call. 
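// In other words: rank `id` covers the sub-interval
// [id/num_procs, (id+1)/num_procs) of [0, 1), summing roughly
// total_num_steps / num_procs rectangle heights that start at
// x = id / num_procs, and then scaling the sum by the step width.
// The value returned is only this rank's share; main() later combines
// the per-rank results with an MPI_Reduce on the master rank.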
+template +float mpi_dpstd_onestep(int id, int num_procs, long total_num_steps, + Policy& policy) { + int num_step_per_rank = total_num_steps / num_procs; + float step = 1.0f / (float)total_num_steps; + + float total = std::transform_reduce( + policy, oneapi::dpl::counting_iterator(1), + oneapi::dpl::counting_iterator(num_step_per_rank), 0.0f, + std::plus(), [=](int i) { + float x = ((float)id / (float)num_procs) + i * step - step / 2; + return (4.0f / (1.0f + x * x)); + }); + total = total * (float)step; + + return total; +} + int main(int argc, char** argv) { int num_steps = 1000000; - printf("Number of steps is %d\n", num_steps); int groups = 10000; - + char machine_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + int id; + int num_procs; float pi; queue myQueue{property::queue::in_order()}; auto policy = oneapi::dpl::execution::make_device_policy( queue(default_selector{}, dpc_common::exception_handler)); - // Since we are using JIT compiler for samples, - // we need to run each step once to allow for compile - // to occur before we time execution of function. - pi = calc_pi_dpstd_native(num_steps, policy); - pi = calc_pi_dpstd_native2(num_steps, policy, groups); - pi = calc_pi_dpstd_native3(num_steps, groups, policy); - pi = calc_pi_dpstd_native4(num_steps, groups, policy); - - pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); - pi = calc_pi_dpstd_onestep(num_steps, policy); - - dpc_common::TimeInterval T; - pi = calc_pi_cpu_seq(num_steps); - auto stop = T.Elapsed(); - std::cout << "Cpu Seq calc: \t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop << " seconds\n"; - - dpc_common::TimeInterval T2; - pi = calc_pi_cpu_tbb(num_steps); - auto stop2 = T2.Elapsed(); - std::cout << "Cpu TBB calc: \t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop2 << " seconds\n"; - - dpc_common::TimeInterval T3; - pi = calc_pi_dpstd_native(num_steps, policy); - auto stop3 = T3.Elapsed(); - std::cout << "dpstd native:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3 << " seconds\n"; - - dpc_common::TimeInterval T3a; - pi = calc_pi_dpstd_native2(num_steps, policy, groups); - auto stop3a = T3a.Elapsed(); - std::cout << "dpstd native2:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3a << " seconds\n"; - - dpc_common::TimeInterval T3b; - pi = calc_pi_dpstd_native3(num_steps, groups, policy); - auto stop3b = T3b.Elapsed(); - std::cout << "dpstd native3:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3b << " seconds\n"; - - dpc_common::TimeInterval T3c; - pi = calc_pi_dpstd_native4(num_steps, groups, policy); - auto stop3c = T3c.Elapsed(); - std::cout << "dpstd native4:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3c << " seconds\n"; - - dpc_common::TimeInterval T4; - pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); - auto stop4 = T4.Elapsed(); - std::cout << "dpstd two steps:\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop4 << " seconds\n"; - - dpc_common::TimeInterval T5; - pi = calc_pi_dpstd_onestep(num_steps, policy); - auto stop5 = T5.Elapsed(); - std::cout << "dpstd transform_reduce: "; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop5 << " seconds\n"; - - std::cout << "success\n"; + // Start MPI. 
+ if (MPI_Init(&argc, &argv) != MPI_SUCCESS) { + std::cout << "Failed to initialize MPI\n"; + exit(-1); + } + + // Create the communicator, and retrieve the number of MPI ranks. + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + // Determine the rank number. + MPI_Comm_rank(MPI_COMM_WORLD, &id); + + // Get the machine name. + MPI_Get_processor_name(machine_name, &name_len); + + std::cout << "Rank #" << id << " runs on: " << machine_name + << ", uses device: " + << myQueue.get_device().get_info() << "\n"; + + if (id == master) { + printf("Number of steps is %d\n", num_steps); + + // Since we are using JIT compiler for samples, + // we need to run each step once to allow for compile + // to occur before we time execution of function. + pi = calc_pi_dpstd_native(num_steps, policy); + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + pi = calc_pi_dpstd_onestep(num_steps, policy); + + dpc_common::TimeInterval T; + pi = calc_pi_cpu_seq(num_steps); + auto stop = T.Elapsed(); + std::cout << "Cpu Seq calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop << " seconds\n"; + + dpc_common::TimeInterval T2; + pi = calc_pi_cpu_tbb(num_steps); + auto stop2 = T2.Elapsed(); + std::cout << "Cpu TBB calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop2 << " seconds\n"; + + dpc_common::TimeInterval T3; + pi = calc_pi_dpstd_native(num_steps, policy); + auto stop3 = T3.Elapsed(); + std::cout << "dpstd native:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3 << " seconds\n"; + + dpc_common::TimeInterval T3a; + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + auto stop3a = T3a.Elapsed(); + std::cout << "dpstd native2:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3a << " seconds\n"; + + dpc_common::TimeInterval T3b; + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + auto stop3b = T3b.Elapsed(); + std::cout << "dpstd native3:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3b << " seconds\n"; + + dpc_common::TimeInterval T3c; + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + auto stop3c = T3c.Elapsed(); + std::cout << "dpstd native4:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3c << " seconds\n"; + + dpc_common::TimeInterval T4; + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + auto stop4 = T4.Elapsed(); + std::cout << "dpstd two steps:\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop4 << " seconds\n"; + + dpc_common::TimeInterval T5; + pi = calc_pi_dpstd_onestep(num_steps, policy); + auto stop5 = T5.Elapsed(); + std::cout << "dpstd transform_reduce: "; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop5 << " seconds\n"; + } + + int num_step_per_rank = num_steps / num_procs; + float* results_per_rank = new float[num_step_per_rank]; + + // Initialize an array to store a partial result per rank. + for (size_t i = 0; i < num_step_per_rank; i++) results_per_rank[i] = 0.0; + + dpc_common::TimeInterval T6; + // Calculate the Pi number partially by multiple MPI ranks. 
+ mpi_native(results_per_rank, id, num_procs, num_steps, myQueue); + + float local_sum = 0.0; + + // Use the DPC++ library call to reduce the array using plus + buffer calc_values(results_per_rank, num_step_per_rank); + auto calc_begin2 = dpstd::begin(calc_values); + auto calc_end2 = dpstd::end(calc_values); + + local_sum = + std::reduce(policy, calc_begin2, calc_end2, 0.0f, std::plus()); + policy.queue().wait(); + + // Master rank performs a reduce operation to get the sum of all partial Pi. + MPI_Reduce(&local_sum, &pi, 1, MPI_FLOAT, MPI_SUM, master, MPI_COMM_WORLD); + + if (id == master) { + auto stop6 = T6.Elapsed(); + + std::cout << "mpi native:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop6 << " seconds\n"; + } + + delete[] results_per_rank; + + // mpi_dpstd_onestep + dpc_common::TimeInterval T7; + local_sum = mpi_dpstd_onestep(id, num_procs, num_steps, policy); + auto stop7 = T7.Elapsed(); + + // Master rank performs a reduce operation to get the sum of all partial Pi. + MPI_Reduce(&local_sum, &pi, 1, MPI_FLOAT, MPI_SUM, master, MPI_COMM_WORLD); + + if (id == master) { + auto stop6 = T7.Elapsed(); + + std::cout << "mpi transform_reduce:\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop7 << " seconds\n"; + std::cout << "succes\n"; + } + + MPI_Finalize(); + return 0; } From 9cf98ad97599cd875abe92af2ed8f5b9c1fa2512 Mon Sep 17 00:00:00 2001 From: JoeOster <52936608+JoeOster@users.noreply.github.com> Date: Fri, 21 Aug 2020 09:26:25 -0700 Subject: [PATCH 11/23] Update README.md --- DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md index 6203feaa09..212b4011ce 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md @@ -6,7 +6,8 @@ a MPI program. For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. -| Optimized for | Description +| Optimized for | Description +|:--- |:--- | OS | Linux* Ubuntu* 18.04, | Hardware | Skylake with GEN9 or newer, | Software | Intel® oneAPI DPC++ Compiler (beta) From 6a8a52166bfc80e2440a6719d0f166a5a5aeb2f2 Mon Sep 17 00:00:00 2001 From: JoeOster <52936608+JoeOster@users.noreply.github.com> Date: Fri, 21 Aug 2020 09:29:29 -0700 Subject: [PATCH 12/23] Update main.cpp --- .../DPC++/ParallelPatterns/dpc_reduce/src/main.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp index dc81bc78e0..42c1b2b668 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp @@ -12,11 +12,9 @@ #include #include +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" -// Many oneAPI code samples share common include files. These -// include files are installed locally with the product installation -// and can be located at %ONEAPI_ROOT%\dev-utilities\latest\include -// on your development system. 
using namespace sycl; constexpr int master = 0; From 7bb963142ea705710fb8fcecb490455f36178fb1 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Thu, 27 Aug 2020 12:23:14 -0700 Subject: [PATCH 13/23] Integrate MPI with latest dpc_reduce for beta09. --- .../ParallelPatterns/dpc_reduce/src/main.cpp | 135 +++++++++++++++--- 1 file changed, 117 insertions(+), 18 deletions(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp index 42c1b2b668..6d154fc3f8 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp @@ -8,6 +8,9 @@ #include #include // setprecision library #include + +// The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include +// on your development system. #include #include #include @@ -164,6 +167,108 @@ struct slice_area { }; }; +// a way to get value_type from both accessors and USM that is needed for +// transform_init +template +struct accessor_traits {}; + +template +struct accessor_traits< + sycl::accessor> { + using value_type = typename sycl::accessor::value_type; +}; + +template +struct accessor_traits { + using value_type = RawArrayValueType; +}; + +// calculate shift where we should start processing on current item +template +SizeN calc_shift(const NDItemId item_id, const GlobalIdx global_idx, + SizeNIter& n_iter, const SizeN n) { + auto global_range_size = item_id.get_global_range().size(); + + auto start = n_iter * global_idx; + auto global_shift = global_idx + n_iter * global_range_size; + if (n_iter > 0 && global_shift > n) { + start += n % global_range_size - global_idx; + } else if (global_shift < n) { + n_iter++; + } + return start; +} + +template +struct transform_init { + Operation1 binary_op; + Operation2 unary_op; + + template + void operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, + AccLocal& local_mem, const Acc&... acc) { + auto local_idx = item_id.get_local_id(0); + auto global_range_size = item_id.get_global_range().size(); + auto n_iter = n / global_range_size; + auto start = calc_shift(item_id, global_idx, n_iter, n); + auto shifted_global_idx = global_idx + start; + + typename accessor_traits::value_type res; + if (global_idx < n) { + res = unary_op(shifted_global_idx, acc...); + } + // Add neighbour to the current local_mem + for (decltype(n_iter) i = 1; i < n_iter; ++i) { + res = binary_op(res, unary_op(shifted_global_idx + i, acc...)); + } + if (global_idx < n) { + local_mem[local_idx] = res; + } + } +}; + +// Reduce on local memory +template +struct reduce { + BinaryOperation1 bin_op1; + + template + Tp operator()(const NDItemId item_id, const GlobalIdx global_idx, + const Size n, AccLocal& local_mem) { + auto local_idx = item_id.get_local_id(0); + auto group_size = item_id.get_local_range().size(); + + auto k = 1; + do { + item_id.barrier(sycl::access::fence_space::local_space); + if (local_idx % (2 * k) == 0 && local_idx + k < group_size && + global_idx < n && global_idx + k < n) { + local_mem[local_idx] = + bin_op1(local_mem[local_idx], local_mem[local_idx + k]); + } + k *= 2; + } while (k < group_size); + return local_mem[local_idx]; + } +}; + +// walk through the data +template +struct walk_n { + F f; + + template + auto operator()(const ItemId idx, Ranges&&... 
rngs) + -> decltype(f(rngs[idx]...)) { + return f(rngs[idx]...); + } +}; + // This option uses a parallel for to fill the buffer and then // uses a tranform_init with plus/no_op and then // a local reduction then global reduction. @@ -189,22 +294,19 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { auto calc_begin = oneapi::dpl::begin(buf); auto calc_end = oneapi::dpl::end(buf); - using Functor = oneapi::dpl::unseq_backend::walk_n; + using Functor = walk_n; float result; // Functor will do nothing for tranform_init and will use plus for reduce. // In this example we have done the calculation and filled the buffer above // The way transform_init works is that you need to have the value already // populated in the buffer. - auto tf_init = - oneapi::dpl::unseq_backend::transform_init, - Functor>{std::plus(), - Functor{my_no_op()}}; + auto tf_init = transform_init, Functor>{ + std::plus(), Functor{my_no_op()}}; auto combine = std::plus(); auto brick_reduce = - oneapi::dpl::unseq_backend::reduce, float>{ - std::plus()}; + reduce, float>{std::plus()}; auto workgroup_size = policy.queue() .get_device() @@ -234,8 +336,8 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { [=](nd_item<1> item_id) mutable { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). - tf_init(item_id, global_idx, access_buf, num_steps, - temp_buf_local); + tf_init(item_id, global_idx, num_steps, temp_buf_local, + access_buf); // 2. Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); @@ -295,20 +397,17 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { auto calc_begin = oneapi::dpl::begin(buf2); auto calc_end = oneapi::dpl::end(buf2); - using Functor2 = oneapi::dpl::unseq_backend::walk_n; + using Functor2 = walk_n; // The buffer has 1...num it at and now we will use that as an input // to the slice structue which will calculate the area of each // rectangle. - auto tf_init = - oneapi::dpl::unseq_backend::transform_init, - Functor2>{ - std::plus(), Functor2{slice_area(num_steps)}}; + auto tf_init = transform_init, Functor2>{ + std::plus(), Functor2{slice_area(num_steps)}}; auto combine = std::plus(); auto brick_reduce = - oneapi::dpl::unseq_backend::reduce, float>{ - std::plus()}; + reduce, float>{std::plus()}; // get workgroup_size from the device auto workgroup_size = @@ -347,8 +446,8 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). Fill local // memory - tf_init(item_id, global_idx, access_buf, num_steps, - temp_buf_local); + tf_init(item_id, global_idx, num_steps, temp_buf_local, + access_buf); // 2. 
Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); From 116ec934b9f66e49e11f9c5915608040bafb2f61 Mon Sep 17 00:00:00 2001 From: JoeOster <52936608+JoeOster@users.noreply.github.com> Date: Thu, 27 Aug 2020 13:44:19 -0700 Subject: [PATCH 14/23] Update README.md --- .../ParallelPatterns/dpc_reduce/README.md | 50 ++++++++----------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md index 212b4011ce..1d0f4b9f80 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md @@ -1,13 +1,13 @@ # dpc_reduce Sample -The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. This code sample also demonstrates how to incorporate Data Parallel PC++ into -a MPI program. +The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + | Optimized for | Description -|:--- |:--- +|:--- |:--- | OS | Linux* Ubuntu* 18.04, | Hardware | Skylake with GEN9 or newer, | Software | Intel® oneAPI DPC++ Compiler (beta) @@ -31,12 +31,6 @@ tiny rectangles and then summing up the results. The parallel computations are performed using oneTBB and oneAPI DPC++ library (oneDPL). -This example also demonstrates how to incorporate Data Parallel PC++ into a MPI program. -Using Data Parallel C++, the code sample runs multiple MPI ranks to distribute the -calculation of the number Pi. Each rank offloads the computation to an accelerator -(GPU/CPU) using Intel DPC++ compiler to compute a partial compution of the number Pi. - - ## Key Implementation Details The basic DPC++ implementation explained in the code includes accessor, kernels, queues, buffers as well as some oneDPL library calls. @@ -54,15 +48,14 @@ If running a sample in the Intel DevCloud, remember that you must specify the co ### On a Linux* System Perform the following steps: -1. Build the program using the following 'cmake' commands -export I_MPI_CXX=dpcpp +1. Build the program using the following 'cmake' commands mkdir build cd build cmake .. make 2. Run the program using: -make run or './dpc_reduce' or 'mpirun ./dpc_reduce' +make run or src/dpc_reduce 3. Clean the program using: make clean @@ -70,26 +63,25 @@ make clean ## Running the Sample ### Application Parameters +There are no editable parameters for this sample. - Usage: mpirun -n ./dpc_reduce +### Example of Output +Number of steps is 1000000 -where +Cpu Seq calc: PI =3.14 in 0.00348 seconds - : number of MPI rank. 
+Cpu TBB calc: PI =3.14 in 0.00178 seconds +dpstd native: PI =3.14 in 0.191 seconds -### Example of Output -Rank #0 runs on: lqnguyen-NUC1, uses device: Intel(R) Gen9 HD Graphics NEO -Number of steps is 1000000 -Cpu Seq calc: PI =3.14 in 0.00422 seconds -Cpu TBB calc: PI =3.14 in 0.00177 seconds -dpstd native: PI =3.14 in 0.209 seconds -dpstd native2: PI =3.14 in 0.213 seconds -dpstd native3: PI =3.14 in 0.00222 seconds -dpstd native4: PI =3.14 in 0.00237 seconds -dpstd two steps: PI =3.14 in 0.0014 seconds -dpstd transform_reduce: PI =3.14 in 0.000528 seconds -mpi native: PI =3.14 in 0.548 seconds -mpi transform_reduce: PI =3.14 in 0.000498 seconds -succes +dpstd native2: PI =3.14 in 0.142 seconds + +dpstd native3: PI =3.14 in 0.002 seconds + +dpstd native4: PI =3.14 in 0.00234 seconds + +dpstd two steps: PI =3.14 in 0.00138 seconds + +dpstd transform_reduce: PI =3.14 in 0.000442 seconds +success From 99ae832101389ffffbc910ab1dffe9e32a8f3e3c Mon Sep 17 00:00:00 2001 From: JoeOster <52936608+JoeOster@users.noreply.github.com> Date: Thu, 27 Aug 2020 13:47:48 -0700 Subject: [PATCH 15/23] Update main.cpp --- .../ParallelPatterns/dpc_reduce/src/main.cpp | 506 +++++++----------- 1 file changed, 183 insertions(+), 323 deletions(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp index 6d154fc3f8..e1fea5d761 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp @@ -3,24 +3,21 @@ // // SPDX-License-Identifier: MIT // ============================================================= -#include - #include #include // setprecision library #include -// The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include +// The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include // on your development system. #include #include #include // dpc_common.hpp can be found in the dev-utilities include folder. 
-// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" using namespace sycl; -constexpr int master = 0; // cpu_seq is a simple sequential CPU routine // that calculates all the slices and then @@ -167,108 +164,126 @@ struct slice_area { }; }; -// a way to get value_type from both accessors and USM that is needed for -// transform_init + +// a way to get value_type from both accessors and USM that is needed for transform_init template -struct accessor_traits {}; - -template -struct accessor_traits< - sycl::accessor> { - using value_type = typename sycl::accessor::value_type; +struct accessor_traits +{ +}; + +template +struct accessor_traits> +{ + using value_type = typename sycl::accessor::value_type; }; template -struct accessor_traits { - using value_type = RawArrayValueType; +struct accessor_traits +{ + using value_type = RawArrayValueType; }; // calculate shift where we should start processing on current item -template -SizeN calc_shift(const NDItemId item_id, const GlobalIdx global_idx, - SizeNIter& n_iter, const SizeN n) { - auto global_range_size = item_id.get_global_range().size(); - - auto start = n_iter * global_idx; - auto global_shift = global_idx + n_iter * global_range_size; - if (n_iter > 0 && global_shift > n) { - start += n % global_range_size - global_idx; - } else if (global_shift < n) { - n_iter++; - } - return start; -} - -template -struct transform_init { - Operation1 binary_op; - Operation2 unary_op; - - template - void operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, - AccLocal& local_mem, const Acc&... acc) { - auto local_idx = item_id.get_local_id(0); +template +SizeN +calc_shift(const NDItemId item_id, const GlobalIdx global_idx, SizeNIter& n_iter, const SizeN n) +{ auto global_range_size = item_id.get_global_range().size(); - auto n_iter = n / global_range_size; - auto start = calc_shift(item_id, global_idx, n_iter, n); - auto shifted_global_idx = global_idx + start; - typename accessor_traits::value_type res; - if (global_idx < n) { - res = unary_op(shifted_global_idx, acc...); + auto start = n_iter * global_idx; + auto global_shift = global_idx + n_iter * global_range_size; + if (n_iter > 0 && global_shift > n) + { + start += n % global_range_size - global_idx; } - // Add neighbour to the current local_mem - for (decltype(n_iter) i = 1; i < n_iter; ++i) { - res = binary_op(res, unary_op(shifted_global_idx + i, acc...)); + else if (global_shift < n) + { + n_iter++; } - if (global_idx < n) { - local_mem[local_idx] = res; + return start; +} + + +template +struct transform_init +{ + Operation1 binary_op; + Operation2 unary_op; + + template + void + operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, AccLocal& local_mem, + const Acc&... 
acc) + { + auto local_idx = item_id.get_local_id(0); + auto global_range_size = item_id.get_global_range().size(); + auto n_iter = n / global_range_size; + auto start = calc_shift(item_id, global_idx, n_iter, n); + auto shifted_global_idx = global_idx + start; + + typename accessor_traits::value_type res; + if (global_idx < n) + { + res = unary_op(shifted_global_idx, acc...); + } + // Add neighbour to the current local_mem + for (decltype(n_iter) i = 1; i < n_iter; ++i) + { + res = binary_op(res, unary_op(shifted_global_idx + i, acc...)); + } + if (global_idx < n) + { + local_mem[local_idx] = res; + } } - } }; + // Reduce on local memory template -struct reduce { - BinaryOperation1 bin_op1; - - template - Tp operator()(const NDItemId item_id, const GlobalIdx global_idx, - const Size n, AccLocal& local_mem) { - auto local_idx = item_id.get_local_id(0); - auto group_size = item_id.get_local_range().size(); - - auto k = 1; - do { - item_id.barrier(sycl::access::fence_space::local_space); - if (local_idx % (2 * k) == 0 && local_idx + k < group_size && - global_idx < n && global_idx + k < n) { - local_mem[local_idx] = - bin_op1(local_mem[local_idx], local_mem[local_idx + k]); - } - k *= 2; - } while (k < group_size); - return local_mem[local_idx]; - } +struct reduce +{ + BinaryOperation1 bin_op1; + + template + Tp + operator()(const NDItemId item_id, const GlobalIdx global_idx, const Size n, AccLocal& local_mem) + { + auto local_idx = item_id.get_local_id(0); + auto group_size = item_id.get_local_range().size(); + + auto k = 1; + do + { + item_id.barrier(sycl::access::fence_space::local_space); + if (local_idx % (2 * k) == 0 && local_idx + k < group_size && global_idx < n && + global_idx + k < n) + { + local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]); + } + k *= 2; + } while (k < group_size); + return local_mem[local_idx]; + } }; + // walk through the data template -struct walk_n { - F f; - - template - auto operator()(const ItemId idx, Ranges&&... rngs) - -> decltype(f(rngs[idx]...)) { - return f(rngs[idx]...); - } +struct walk_n +{ + F f; + + template + auto + operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...)) + { + return f(rngs[idx]...); + } }; + // This option uses a parallel for to fill the buffer and then // uses a tranform_init with plus/no_op and then // a local reduction then global reduction. @@ -301,12 +316,12 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { // In this example we have done the calculation and filled the buffer above // The way transform_init works is that you need to have the value already // populated in the buffer. - auto tf_init = transform_init, Functor>{ - std::plus(), Functor{my_no_op()}}; + auto tf_init = transform_init, + Functor>{std::plus(), Functor{my_no_op()}}; auto combine = std::plus(); - auto brick_reduce = - reduce, float>{std::plus()}; + auto brick_reduce = reduce, float>{ + std::plus()}; auto workgroup_size = policy.queue() .get_device() @@ -336,8 +351,8 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { [=](nd_item<1> item_id) mutable { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). - tf_init(item_id, global_idx, num_steps, temp_buf_local, - access_buf); + tf_init(item_id, global_idx, num_steps, + temp_buf_local, access_buf); // 2. 
Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); @@ -402,12 +417,13 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { // The buffer has 1...num it at and now we will use that as an input // to the slice structue which will calculate the area of each // rectangle. - auto tf_init = transform_init, Functor2>{ - std::plus(), Functor2{slice_area(num_steps)}}; + auto tf_init = transform_init, + Functor2>{ + std::plus(), Functor2{slice_area(num_steps)}}; auto combine = std::plus(); - auto brick_reduce = - reduce, float>{std::plus()}; + auto brick_reduce = reduce, float>{ + std::plus()}; // get workgroup_size from the device auto workgroup_size = @@ -446,8 +462,8 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). Fill local // memory - tf_init(item_id, global_idx, num_steps, temp_buf_local, - access_buf); + tf_init(item_id, global_idx, num_steps, + temp_buf_local, access_buf); // 2. Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); @@ -538,239 +554,83 @@ float calc_pi_dpstd_onestep(int num_steps, Policy& policy) { return total; } -//////////////////////////////////////////////////////////////////////// -// -// Each MPI ranks compute the number Pi partially on target device using DPC++. -// The partial result of number Pi is returned in "results". -// -//////////////////////////////////////////////////////////////////////// -void mpi_native(float* results, int rank_num, int num_procs, - long total_num_steps, queue& q) { - int num_step_per_rank = total_num_steps / num_procs; - float dx, dx2; - - dx = 1.0f / (float)total_num_steps; - dx2 = dx / 2.0f; - - default_selector device_selector; - - // exception handler - // - // The exception_list parameter is an iterable list of std::exception_ptr - // objects. But those pointers are not always directly readable. So, we - // rethrow the pointer, catch it, and then we have the exception itself. - // Note: depending upon the operation there may be several exceptions. - auto exception_handler = [&](exception_list exceptionList) { - for (std::exception_ptr const& e : exceptionList) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception const& e) { - std::cout << "Failure" - << "\n"; - std::terminate(); - } - } - }; - - try { - // The size of amount of memory that will be given to the buffer. - range<1> num_items{total_num_steps / size_t(num_procs)}; - - // Buffers are used to tell SYCL which data will be shared between the host - // and the devices. - buffer results_buf(results, - range<1>(total_num_steps / size_t(num_procs))); - - // Submit takes in a lambda that is passed in a command group handler - // constructed at runtime. - q.submit([&](handler& h) { - // Accessors are used to get access to the memory owned by the buffers. - auto results_accessor = results_buf.get_access(h); - // Each kernel calculates a partial of the number Pi in parallel. - h.parallel_for(num_items, [=](id<1> k) { - float x = ((float)rank_num / (float)num_procs) + (float)k * dx + dx2; - results_accessor[k] = (4.0f * dx) / (1.0f + x * x); - }); - }); - } catch (...) { - std::cout << "Failure" << std::endl; - } -} - -// This function uses the DPC++ library call transform reduce. -// It does everything in one library call. 
-template -float mpi_dpstd_onestep(int id, int num_procs, long total_num_steps, - Policy& policy) { - int num_step_per_rank = total_num_steps / num_procs; - float step = 1.0f / (float)total_num_steps; - - float total = std::transform_reduce( - policy, oneapi::dpl::counting_iterator(1), - oneapi::dpl::counting_iterator(num_step_per_rank), 0.0f, - std::plus(), [=](int i) { - float x = ((float)id / (float)num_procs) + i * step - step / 2; - return (4.0f / (1.0f + x * x)); - }); - total = total * (float)step; - - return total; -} - int main(int argc, char** argv) { int num_steps = 1000000; + printf("Number of steps is %d\n", num_steps); int groups = 10000; - char machine_name[MPI_MAX_PROCESSOR_NAME]; - int name_len; - int id; - int num_procs; + float pi; queue myQueue{property::queue::in_order()}; auto policy = oneapi::dpl::execution::make_device_policy( queue(default_selector{}, dpc_common::exception_handler)); - // Start MPI. - if (MPI_Init(&argc, &argv) != MPI_SUCCESS) { - std::cout << "Failed to initialize MPI\n"; - exit(-1); - } - - // Create the communicator, and retrieve the number of MPI ranks. - MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - // Determine the rank number. - MPI_Comm_rank(MPI_COMM_WORLD, &id); - - // Get the machine name. - MPI_Get_processor_name(machine_name, &name_len); - - std::cout << "Rank #" << id << " runs on: " << machine_name - << ", uses device: " - << myQueue.get_device().get_info() << "\n"; - - if (id == master) { - printf("Number of steps is %d\n", num_steps); - - // Since we are using JIT compiler for samples, - // we need to run each step once to allow for compile - // to occur before we time execution of function. - pi = calc_pi_dpstd_native(num_steps, policy); - pi = calc_pi_dpstd_native2(num_steps, policy, groups); - pi = calc_pi_dpstd_native3(num_steps, groups, policy); - pi = calc_pi_dpstd_native4(num_steps, groups, policy); - - pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); - pi = calc_pi_dpstd_onestep(num_steps, policy); - - dpc_common::TimeInterval T; - pi = calc_pi_cpu_seq(num_steps); - auto stop = T.Elapsed(); - std::cout << "Cpu Seq calc: \t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop << " seconds\n"; - - dpc_common::TimeInterval T2; - pi = calc_pi_cpu_tbb(num_steps); - auto stop2 = T2.Elapsed(); - std::cout << "Cpu TBB calc: \t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop2 << " seconds\n"; - - dpc_common::TimeInterval T3; - pi = calc_pi_dpstd_native(num_steps, policy); - auto stop3 = T3.Elapsed(); - std::cout << "dpstd native:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3 << " seconds\n"; - - dpc_common::TimeInterval T3a; - pi = calc_pi_dpstd_native2(num_steps, policy, groups); - auto stop3a = T3a.Elapsed(); - std::cout << "dpstd native2:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3a << " seconds\n"; - - dpc_common::TimeInterval T3b; - pi = calc_pi_dpstd_native3(num_steps, groups, policy); - auto stop3b = T3b.Elapsed(); - std::cout << "dpstd native3:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3b << " seconds\n"; - - dpc_common::TimeInterval T3c; - pi = calc_pi_dpstd_native4(num_steps, groups, policy); - auto stop3c = T3c.Elapsed(); - std::cout << "dpstd native4:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3c << " seconds\n"; - - dpc_common::TimeInterval T4; - pi = 
calc_pi_dpstd_two_steps_lib(num_steps, policy); - auto stop4 = T4.Elapsed(); - std::cout << "dpstd two steps:\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop4 << " seconds\n"; - - dpc_common::TimeInterval T5; - pi = calc_pi_dpstd_onestep(num_steps, policy); - auto stop5 = T5.Elapsed(); - std::cout << "dpstd transform_reduce: "; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop5 << " seconds\n"; - } - - int num_step_per_rank = num_steps / num_procs; - float* results_per_rank = new float[num_step_per_rank]; - - // Initialize an array to store a partial result per rank. - for (size_t i = 0; i < num_step_per_rank; i++) results_per_rank[i] = 0.0; - - dpc_common::TimeInterval T6; - // Calculate the Pi number partially by multiple MPI ranks. - mpi_native(results_per_rank, id, num_procs, num_steps, myQueue); - - float local_sum = 0.0; - - // Use the DPC++ library call to reduce the array using plus - buffer calc_values(results_per_rank, num_step_per_rank); - auto calc_begin2 = dpstd::begin(calc_values); - auto calc_end2 = dpstd::end(calc_values); - - local_sum = - std::reduce(policy, calc_begin2, calc_end2, 0.0f, std::plus()); - policy.queue().wait(); - - // Master rank performs a reduce operation to get the sum of all partial Pi. - MPI_Reduce(&local_sum, &pi, 1, MPI_FLOAT, MPI_SUM, master, MPI_COMM_WORLD); - - if (id == master) { - auto stop6 = T6.Elapsed(); - - std::cout << "mpi native:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop6 << " seconds\n"; - } - - delete[] results_per_rank; - - // mpi_dpstd_onestep - dpc_common::TimeInterval T7; - local_sum = mpi_dpstd_onestep(id, num_procs, num_steps, policy); - auto stop7 = T7.Elapsed(); - - // Master rank performs a reduce operation to get the sum of all partial Pi. - MPI_Reduce(&local_sum, &pi, 1, MPI_FLOAT, MPI_SUM, master, MPI_COMM_WORLD); - - if (id == master) { - auto stop6 = T7.Elapsed(); - - std::cout << "mpi transform_reduce:\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop7 << " seconds\n"; - std::cout << "succes\n"; - } - - MPI_Finalize(); - + // Since we are using JIT compiler for samples, + // we need to run each step once to allow for compile + // to occur before we time execution of function. 
+ pi = calc_pi_dpstd_native(num_steps, policy); + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + pi = calc_pi_dpstd_onestep(num_steps, policy); + + dpc_common::TimeInterval T; + pi = calc_pi_cpu_seq(num_steps); + auto stop = T.Elapsed(); + std::cout << "Cpu Seq calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop << " seconds\n"; + + dpc_common::TimeInterval T2; + pi = calc_pi_cpu_tbb(num_steps); + auto stop2 = T2.Elapsed(); + std::cout << "Cpu TBB calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop2 << " seconds\n"; + + dpc_common::TimeInterval T3; + pi = calc_pi_dpstd_native(num_steps, policy); + auto stop3 = T3.Elapsed(); + std::cout << "dpstd native:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3 << " seconds\n"; + + dpc_common::TimeInterval T3a; + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + auto stop3a = T3a.Elapsed(); + std::cout << "dpstd native2:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3a << " seconds\n"; + + dpc_common::TimeInterval T3b; + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + auto stop3b = T3b.Elapsed(); + std::cout << "dpstd native3:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3b << " seconds\n"; + + dpc_common::TimeInterval T3c; + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + auto stop3c = T3c.Elapsed(); + std::cout << "dpstd native4:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3c << " seconds\n"; + + dpc_common::TimeInterval T4; + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + auto stop4 = T4.Elapsed(); + std::cout << "dpstd two steps:\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop4 << " seconds\n"; + + dpc_common::TimeInterval T5; + pi = calc_pi_dpstd_onestep(num_steps, policy); + auto stop5 = T5.Elapsed(); + std::cout << "dpstd transform_reduce: "; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop5 << " seconds\n"; + + std::cout << "success\n"; return 0; } From f170e71a8919353a5647e786184f351df93b34bb Mon Sep 17 00:00:00 2001 From: JoeOster <52936608+JoeOster@users.noreply.github.com> Date: Thu, 27 Aug 2020 13:48:51 -0700 Subject: [PATCH 16/23] Update main.cpp --- .../ParallelPatterns/dpc_reduce/src/main.cpp | 506 +++++++++++------- 1 file changed, 323 insertions(+), 183 deletions(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp index e1fea5d761..6d154fc3f8 100755 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp @@ -3,21 +3,24 @@ // // SPDX-License-Identifier: MIT // ============================================================= +#include + #include #include // setprecision library #include -// The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include +// The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include // on your development system. #include #include #include // dpc_common.hpp can be found in the dev-utilities include folder. 
-// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" using namespace sycl; +constexpr int master = 0; // cpu_seq is a simple sequential CPU routine // that calculates all the slices and then @@ -164,125 +167,107 @@ struct slice_area { }; }; - -// a way to get value_type from both accessors and USM that is needed for transform_init +// a way to get value_type from both accessors and USM that is needed for +// transform_init template -struct accessor_traits -{ -}; - -template -struct accessor_traits> -{ - using value_type = typename sycl::accessor::value_type; +struct accessor_traits {}; + +template +struct accessor_traits< + sycl::accessor> { + using value_type = typename sycl::accessor::value_type; }; template -struct accessor_traits -{ - using value_type = RawArrayValueType; +struct accessor_traits { + using value_type = RawArrayValueType; }; // calculate shift where we should start processing on current item -template -SizeN -calc_shift(const NDItemId item_id, const GlobalIdx global_idx, SizeNIter& n_iter, const SizeN n) -{ +template +SizeN calc_shift(const NDItemId item_id, const GlobalIdx global_idx, + SizeNIter& n_iter, const SizeN n) { + auto global_range_size = item_id.get_global_range().size(); + + auto start = n_iter * global_idx; + auto global_shift = global_idx + n_iter * global_range_size; + if (n_iter > 0 && global_shift > n) { + start += n % global_range_size - global_idx; + } else if (global_shift < n) { + n_iter++; + } + return start; +} + +template +struct transform_init { + Operation1 binary_op; + Operation2 unary_op; + + template + void operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, + AccLocal& local_mem, const Acc&... acc) { + auto local_idx = item_id.get_local_id(0); auto global_range_size = item_id.get_global_range().size(); + auto n_iter = n / global_range_size; + auto start = calc_shift(item_id, global_idx, n_iter, n); + auto shifted_global_idx = global_idx + start; - auto start = n_iter * global_idx; - auto global_shift = global_idx + n_iter * global_range_size; - if (n_iter > 0 && global_shift > n) - { - start += n % global_range_size - global_idx; + typename accessor_traits::value_type res; + if (global_idx < n) { + res = unary_op(shifted_global_idx, acc...); } - else if (global_shift < n) - { - n_iter++; + // Add neighbour to the current local_mem + for (decltype(n_iter) i = 1; i < n_iter; ++i) { + res = binary_op(res, unary_op(shifted_global_idx + i, acc...)); } - return start; -} - - -template -struct transform_init -{ - Operation1 binary_op; - Operation2 unary_op; - - template - void - operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, AccLocal& local_mem, - const Acc&... 
acc) - { - auto local_idx = item_id.get_local_id(0); - auto global_range_size = item_id.get_global_range().size(); - auto n_iter = n / global_range_size; - auto start = calc_shift(item_id, global_idx, n_iter, n); - auto shifted_global_idx = global_idx + start; - - typename accessor_traits::value_type res; - if (global_idx < n) - { - res = unary_op(shifted_global_idx, acc...); - } - // Add neighbour to the current local_mem - for (decltype(n_iter) i = 1; i < n_iter; ++i) - { - res = binary_op(res, unary_op(shifted_global_idx + i, acc...)); - } - if (global_idx < n) - { - local_mem[local_idx] = res; - } + if (global_idx < n) { + local_mem[local_idx] = res; } + } }; - // Reduce on local memory template -struct reduce -{ - BinaryOperation1 bin_op1; - - template - Tp - operator()(const NDItemId item_id, const GlobalIdx global_idx, const Size n, AccLocal& local_mem) - { - auto local_idx = item_id.get_local_id(0); - auto group_size = item_id.get_local_range().size(); - - auto k = 1; - do - { - item_id.barrier(sycl::access::fence_space::local_space); - if (local_idx % (2 * k) == 0 && local_idx + k < group_size && global_idx < n && - global_idx + k < n) - { - local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]); - } - k *= 2; - } while (k < group_size); - return local_mem[local_idx]; - } -}; +struct reduce { + BinaryOperation1 bin_op1; + template + Tp operator()(const NDItemId item_id, const GlobalIdx global_idx, + const Size n, AccLocal& local_mem) { + auto local_idx = item_id.get_local_id(0); + auto group_size = item_id.get_local_range().size(); + + auto k = 1; + do { + item_id.barrier(sycl::access::fence_space::local_space); + if (local_idx % (2 * k) == 0 && local_idx + k < group_size && + global_idx < n && global_idx + k < n) { + local_mem[local_idx] = + bin_op1(local_mem[local_idx], local_mem[local_idx + k]); + } + k *= 2; + } while (k < group_size); + return local_mem[local_idx]; + } +}; // walk through the data template -struct walk_n -{ - F f; - - template - auto - operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...)) - { - return f(rngs[idx]...); - } -}; +struct walk_n { + F f; + template + auto operator()(const ItemId idx, Ranges&&... rngs) + -> decltype(f(rngs[idx]...)) { + return f(rngs[idx]...); + } +}; // This option uses a parallel for to fill the buffer and then // uses a tranform_init with plus/no_op and then @@ -316,12 +301,12 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { // In this example we have done the calculation and filled the buffer above // The way transform_init works is that you need to have the value already // populated in the buffer. - auto tf_init = transform_init, - Functor>{std::plus(), Functor{my_no_op()}}; + auto tf_init = transform_init, Functor>{ + std::plus(), Functor{my_no_op()}}; auto combine = std::plus(); - auto brick_reduce = reduce, float>{ - std::plus()}; + auto brick_reduce = + reduce, float>{std::plus()}; auto workgroup_size = policy.queue() .get_device() @@ -351,8 +336,8 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { [=](nd_item<1> item_id) mutable { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). - tf_init(item_id, global_idx, num_steps, - temp_buf_local, access_buf); + tf_init(item_id, global_idx, num_steps, temp_buf_local, + access_buf); // 2. 
Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); @@ -417,13 +402,12 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { // The buffer has 1...num it at and now we will use that as an input // to the slice structue which will calculate the area of each // rectangle. - auto tf_init = transform_init, - Functor2>{ - std::plus(), Functor2{slice_area(num_steps)}}; + auto tf_init = transform_init, Functor2>{ + std::plus(), Functor2{slice_area(num_steps)}}; auto combine = std::plus(); - auto brick_reduce = reduce, float>{ - std::plus()}; + auto brick_reduce = + reduce, float>{std::plus()}; // get workgroup_size from the device auto workgroup_size = @@ -462,8 +446,8 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). Fill local // memory - tf_init(item_id, global_idx, num_steps, - temp_buf_local, access_buf); + tf_init(item_id, global_idx, num_steps, temp_buf_local, + access_buf); // 2. Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); @@ -554,83 +538,239 @@ float calc_pi_dpstd_onestep(int num_steps, Policy& policy) { return total; } +//////////////////////////////////////////////////////////////////////// +// +// Each MPI ranks compute the number Pi partially on target device using DPC++. +// The partial result of number Pi is returned in "results". +// +//////////////////////////////////////////////////////////////////////// +void mpi_native(float* results, int rank_num, int num_procs, + long total_num_steps, queue& q) { + int num_step_per_rank = total_num_steps / num_procs; + float dx, dx2; + + dx = 1.0f / (float)total_num_steps; + dx2 = dx / 2.0f; + + default_selector device_selector; + + // exception handler + // + // The exception_list parameter is an iterable list of std::exception_ptr + // objects. But those pointers are not always directly readable. So, we + // rethrow the pointer, catch it, and then we have the exception itself. + // Note: depending upon the operation there may be several exceptions. + auto exception_handler = [&](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::cout << "Failure" + << "\n"; + std::terminate(); + } + } + }; + + try { + // The size of amount of memory that will be given to the buffer. + range<1> num_items{total_num_steps / size_t(num_procs)}; + + // Buffers are used to tell SYCL which data will be shared between the host + // and the devices. + buffer results_buf(results, + range<1>(total_num_steps / size_t(num_procs))); + + // Submit takes in a lambda that is passed in a command group handler + // constructed at runtime. + q.submit([&](handler& h) { + // Accessors are used to get access to the memory owned by the buffers. + auto results_accessor = results_buf.get_access(h); + // Each kernel calculates a partial of the number Pi in parallel. + h.parallel_for(num_items, [=](id<1> k) { + float x = ((float)rank_num / (float)num_procs) + (float)k * dx + dx2; + results_accessor[k] = (4.0f * dx) / (1.0f + x * x); + }); + }); + } catch (...) { + std::cout << "Failure" << std::endl; + } +} + +// This function uses the DPC++ library call transform reduce. +// It does everything in one library call. 
+template +float mpi_dpstd_onestep(int id, int num_procs, long total_num_steps, + Policy& policy) { + int num_step_per_rank = total_num_steps / num_procs; + float step = 1.0f / (float)total_num_steps; + + float total = std::transform_reduce( + policy, oneapi::dpl::counting_iterator(1), + oneapi::dpl::counting_iterator(num_step_per_rank), 0.0f, + std::plus(), [=](int i) { + float x = ((float)id / (float)num_procs) + i * step - step / 2; + return (4.0f / (1.0f + x * x)); + }); + total = total * (float)step; + + return total; +} + int main(int argc, char** argv) { int num_steps = 1000000; - printf("Number of steps is %d\n", num_steps); int groups = 10000; - + char machine_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + int id; + int num_procs; float pi; queue myQueue{property::queue::in_order()}; auto policy = oneapi::dpl::execution::make_device_policy( queue(default_selector{}, dpc_common::exception_handler)); - // Since we are using JIT compiler for samples, - // we need to run each step once to allow for compile - // to occur before we time execution of function. - pi = calc_pi_dpstd_native(num_steps, policy); - pi = calc_pi_dpstd_native2(num_steps, policy, groups); - pi = calc_pi_dpstd_native3(num_steps, groups, policy); - pi = calc_pi_dpstd_native4(num_steps, groups, policy); - - pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); - pi = calc_pi_dpstd_onestep(num_steps, policy); - - dpc_common::TimeInterval T; - pi = calc_pi_cpu_seq(num_steps); - auto stop = T.Elapsed(); - std::cout << "Cpu Seq calc: \t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop << " seconds\n"; - - dpc_common::TimeInterval T2; - pi = calc_pi_cpu_tbb(num_steps); - auto stop2 = T2.Elapsed(); - std::cout << "Cpu TBB calc: \t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop2 << " seconds\n"; - - dpc_common::TimeInterval T3; - pi = calc_pi_dpstd_native(num_steps, policy); - auto stop3 = T3.Elapsed(); - std::cout << "dpstd native:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3 << " seconds\n"; - - dpc_common::TimeInterval T3a; - pi = calc_pi_dpstd_native2(num_steps, policy, groups); - auto stop3a = T3a.Elapsed(); - std::cout << "dpstd native2:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3a << " seconds\n"; - - dpc_common::TimeInterval T3b; - pi = calc_pi_dpstd_native3(num_steps, groups, policy); - auto stop3b = T3b.Elapsed(); - std::cout << "dpstd native3:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3b << " seconds\n"; - - dpc_common::TimeInterval T3c; - pi = calc_pi_dpstd_native4(num_steps, groups, policy); - auto stop3c = T3c.Elapsed(); - std::cout << "dpstd native4:\t\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop3c << " seconds\n"; - - dpc_common::TimeInterval T4; - pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); - auto stop4 = T4.Elapsed(); - std::cout << "dpstd two steps:\t"; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop4 << " seconds\n"; - - dpc_common::TimeInterval T5; - pi = calc_pi_dpstd_onestep(num_steps, policy); - auto stop5 = T5.Elapsed(); - std::cout << "dpstd transform_reduce: "; - std::cout << std::setprecision(3) << "PI =" << pi; - std::cout << " in " << stop5 << " seconds\n"; - - std::cout << "success\n"; + // Start MPI. 
+ if (MPI_Init(&argc, &argv) != MPI_SUCCESS) { + std::cout << "Failed to initialize MPI\n"; + exit(-1); + } + + // Create the communicator, and retrieve the number of MPI ranks. + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + + // Determine the rank number. + MPI_Comm_rank(MPI_COMM_WORLD, &id); + + // Get the machine name. + MPI_Get_processor_name(machine_name, &name_len); + + std::cout << "Rank #" << id << " runs on: " << machine_name + << ", uses device: " + << myQueue.get_device().get_info() << "\n"; + + if (id == master) { + printf("Number of steps is %d\n", num_steps); + + // Since we are using JIT compiler for samples, + // we need to run each step once to allow for compile + // to occur before we time execution of function. + pi = calc_pi_dpstd_native(num_steps, policy); + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + pi = calc_pi_dpstd_onestep(num_steps, policy); + + dpc_common::TimeInterval T; + pi = calc_pi_cpu_seq(num_steps); + auto stop = T.Elapsed(); + std::cout << "Cpu Seq calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop << " seconds\n"; + + dpc_common::TimeInterval T2; + pi = calc_pi_cpu_tbb(num_steps); + auto stop2 = T2.Elapsed(); + std::cout << "Cpu TBB calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop2 << " seconds\n"; + + dpc_common::TimeInterval T3; + pi = calc_pi_dpstd_native(num_steps, policy); + auto stop3 = T3.Elapsed(); + std::cout << "dpstd native:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3 << " seconds\n"; + + dpc_common::TimeInterval T3a; + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + auto stop3a = T3a.Elapsed(); + std::cout << "dpstd native2:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3a << " seconds\n"; + + dpc_common::TimeInterval T3b; + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + auto stop3b = T3b.Elapsed(); + std::cout << "dpstd native3:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3b << " seconds\n"; + + dpc_common::TimeInterval T3c; + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + auto stop3c = T3c.Elapsed(); + std::cout << "dpstd native4:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3c << " seconds\n"; + + dpc_common::TimeInterval T4; + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + auto stop4 = T4.Elapsed(); + std::cout << "dpstd two steps:\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop4 << " seconds\n"; + + dpc_common::TimeInterval T5; + pi = calc_pi_dpstd_onestep(num_steps, policy); + auto stop5 = T5.Elapsed(); + std::cout << "dpstd transform_reduce: "; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop5 << " seconds\n"; + } + + int num_step_per_rank = num_steps / num_procs; + float* results_per_rank = new float[num_step_per_rank]; + + // Initialize an array to store a partial result per rank. + for (size_t i = 0; i < num_step_per_rank; i++) results_per_rank[i] = 0.0; + + dpc_common::TimeInterval T6; + // Calculate the Pi number partially by multiple MPI ranks. 
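+  // What follows is the MPI-native path: (1) mpi_native fills this rank's
+  // results_per_rank slice on the device, (2) std::reduce with the device
+  // policy collapses that slice to a single float, and (3) MPI_Reduce sums
+  // the per-rank values into "pi" on the master rank.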
+ mpi_native(results_per_rank, id, num_procs, num_steps, myQueue); + + float local_sum = 0.0; + + // Use the DPC++ library call to reduce the array using plus + buffer calc_values(results_per_rank, num_step_per_rank); + auto calc_begin2 = dpstd::begin(calc_values); + auto calc_end2 = dpstd::end(calc_values); + + local_sum = + std::reduce(policy, calc_begin2, calc_end2, 0.0f, std::plus()); + policy.queue().wait(); + + // Master rank performs a reduce operation to get the sum of all partial Pi. + MPI_Reduce(&local_sum, &pi, 1, MPI_FLOAT, MPI_SUM, master, MPI_COMM_WORLD); + + if (id == master) { + auto stop6 = T6.Elapsed(); + + std::cout << "mpi native:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop6 << " seconds\n"; + } + + delete[] results_per_rank; + + // mpi_dpstd_onestep + dpc_common::TimeInterval T7; + local_sum = mpi_dpstd_onestep(id, num_procs, num_steps, policy); + auto stop7 = T7.Elapsed(); + + // Master rank performs a reduce operation to get the sum of all partial Pi. + MPI_Reduce(&local_sum, &pi, 1, MPI_FLOAT, MPI_SUM, master, MPI_COMM_WORLD); + + if (id == master) { + auto stop6 = T7.Elapsed(); + + std::cout << "mpi transform_reduce:\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop7 << " seconds\n"; + std::cout << "succes\n"; + } + + MPI_Finalize(); + return 0; } From 7a8b9619560eb61a6b8453f41b6952c31b8c63e4 Mon Sep 17 00:00:00 2001 From: JoeOster <52936608+JoeOster@users.noreply.github.com> Date: Thu, 27 Aug 2020 13:49:25 -0700 Subject: [PATCH 17/23] Update README.md --- .../ParallelPatterns/dpc_reduce/README.md | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md index fa48829608..212b4011ce 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md @@ -1,13 +1,13 @@ # dpc_reduce Sample -The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. +The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. This code sample also demonstrates how to incorporate Data Parallel PC++ into +a MPI program. For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. - | Optimized for | Description -|:--- |:--- +|:--- |:--- | OS | Linux* Ubuntu* 18.04, | Hardware | Skylake with GEN9 or newer, | Software | Intel® oneAPI DPC++ Compiler (beta) @@ -31,6 +31,12 @@ tiny rectangles and then summing up the results. The parallel computations are performed using oneTBB and oneAPI DPC++ library (oneDPL). +This example also demonstrates how to incorporate Data Parallel PC++ into a MPI program. +Using Data Parallel C++, the code sample runs multiple MPI ranks to distribute the +calculation of the number Pi. Each rank offloads the computation to an accelerator +(GPU/CPU) using Intel DPC++ compiler to compute a partial compution of the number Pi. + + ## Key Implementation Details The basic DPC++ implementation explained in the code includes accessor, kernels, queues, buffers as well as some oneDPL library calls. 
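+
+The MPI flow described above can be condensed into the short, self-contained
+sketch below. It is illustrative only: identifiers such as `steps_per_rank`
+are not taken from the sample, but the MPI and oneDPL calls mirror the ones
+used in `src/main.cpp` (exact oneDPL headers may vary between oneAPI releases).
+
+```cpp
+#include <mpi.h>
+#include <CL/sycl.hpp>
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/iterator>
+#include <oneapi/dpl/numeric>
+#include <cstdio>
+
+int main(int argc, char** argv) {
+  MPI_Init(&argc, &argv);
+  int rank = 0, procs = 1;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &procs);
+
+  const long num_steps = 1000000;
+  const long steps_per_rank = num_steps / procs;
+  const float dx = 1.0f / (float)num_steps;
+
+  // Each rank integrates its own slice of 4/(1+x*x) over [0,1) on the device.
+  auto policy = oneapi::dpl::execution::make_device_policy(
+      sycl::queue{sycl::default_selector{}});
+  float local_sum = std::transform_reduce(
+      policy, oneapi::dpl::counting_iterator<long>(0),
+      oneapi::dpl::counting_iterator<long>(steps_per_rank), 0.0f,
+      std::plus<float>(), [=](long i) {
+        float x = (float)rank / (float)procs + ((float)i + 0.5f) * dx;
+        return 4.0f / (1.0f + x * x);
+      });
+  local_sum *= dx;
+
+  // The master rank accumulates the per-rank partial sums.
+  float pi = 0.0f;
+  MPI_Reduce(&local_sum, &pi, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
+  if (rank == 0) std::printf("PI = %f\n", pi);
+
+  MPI_Finalize();
+  return 0;
+}
+```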
@@ -48,14 +54,15 @@ If running a sample in the Intel DevCloud, remember that you must specify the co ### On a Linux* System Perform the following steps: -1. Build the program using the following 'cmake' commands +1. Build the program using the following 'cmake' commands +export I_MPI_CXX=dpcpp mkdir build cd build cmake .. make 2. Run the program using: -make run or src/dpc_reduce +make run or './dpc_reduce' or 'mpirun ./dpc_reduce' 3. Clean the program using: make clean @@ -63,27 +70,26 @@ make clean ## Running the Sample ### Application Parameters -There are no editable parameters for this sample. - -### Example of Output -Number of steps is 1000000 - -Cpu Seq calc: PI =3.14 in 0.00348 seconds -Cpu TBB calc: PI =3.14 in 0.00178 seconds + Usage: mpirun -n ./dpc_reduce -dpstd native: PI =3.14 in 0.191 seconds +where -dpstd native2: PI =3.14 in 0.142 seconds + : number of MPI rank. -dpstd native3: PI =3.14 in 0.002 seconds - -dpstd native4: PI =3.14 in 0.00234 seconds - -dpstd two steps: PI =3.14 in 0.00138 seconds - -dpstd transform_reduce: PI =3.14 in 0.000442 seconds - -success +### Example of Output +Rank #0 runs on: lqnguyen-NUC1, uses device: Intel(R) Gen9 HD Graphics NEO +Number of steps is 1000000 +Cpu Seq calc: PI =3.14 in 0.00422 seconds +Cpu TBB calc: PI =3.14 in 0.00177 seconds +dpstd native: PI =3.14 in 0.209 seconds +dpstd native2: PI =3.14 in 0.213 seconds +dpstd native3: PI =3.14 in 0.00222 seconds +dpstd native4: PI =3.14 in 0.00237 seconds +dpstd two steps: PI =3.14 in 0.0014 seconds +dpstd transform_reduce: PI =3.14 in 0.000528 seconds +mpi native: PI =3.14 in 0.548 seconds +mpi transform_reduce: PI =3.14 in 0.000498 seconds +succes From c71b30591505b87f83304ca2c32530d6d90c0645 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Thu, 27 Aug 2020 15:30:37 -0700 Subject: [PATCH 18/23] Update CXX to icpx and compiler option for beta09. Signed-off-by: Loc Nguyen --- .../DPC++/DenseLinearAlgebra/matrix_mul/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/Makefile b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/Makefile index ed548d7e17..13c3318b91 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/Makefile +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/Makefile @@ -4,8 +4,8 @@ DPCPP_LDFLAGS = DPCPP_EXE_NAME = matrix_mul_dpc DPCPP_SOURCES = src/matrix_mul_dpcpp.cpp -CXX = icc -OMP_CXXFLAGS = -qnextgen -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ -g -o +CXX = icpx +OMP_CXXFLAGS = -fiopenmp -fopenmp-targets=spir64 -D__STRICT_ANSI__ -g -o OMP_LDFLAGS = OMP_EXE_NAME = matrix_mul_omp OMP_SOURCES = src/matrix_mul_omp.cpp From 0507e5ebea49ea331565edacc6991e342e9b5619 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 28 Aug 2020 09:55:52 -0700 Subject: [PATCH 19/23] Add "export I_MPI_CXX=dpcpp" in sample.json file. 
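
I_MPI_CXX tells the Intel MPI compiler wrappers which underlying C++ compiler
to invoke; exporting it before running cmake, as the README shows, makes the
wrappers drive dpcpp:

    export I_MPI_CXX=dpcpp
    mkdir build
    cd build
    cmake ..
    make
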
Signed-off-by: Loc Nguyen --- .../DPC++/ParallelPatterns/dpc_reduce/sample.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json index cbce8a1dfc..5302acc385 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json @@ -15,7 +15,8 @@ { "id": "dpc_reduce", "steps": [ - "mkdir build", + "export I_MPI_CXX=dpcpp", + "mkdir build", "cd build", "cmake ..", "make", From ca398c65beb6b5a537ed13a9e04d62640c6926c3 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Thu, 17 Sep 2020 18:10:59 -0700 Subject: [PATCH 20/23] Update json file. --- DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json index def268a2f8..71a54aeb58 100644 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json @@ -1,7 +1,7 @@ { "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", "name": "PrefixSum", - "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU" ], "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", "toolchain": [ "dpcpp" ], "targetDevice": [ "CPU", "GPU" ], From d80d94ab413cc1c1da1ae52cb12e8faec181e9ad Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Thu, 24 Sep 2020 14:00:57 -0700 Subject: [PATCH 21/23] Sync with master. 
--- .../ParallelPatterns/dpc_reduce/src/main.cpp | 203 ++++++++++-------- 1 file changed, 111 insertions(+), 92 deletions(-) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp index a7b559507a..4fd92b0e1c 100755 --- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp @@ -169,108 +169,126 @@ struct slice_area { }; }; -// a way to get value_type from both accessors and USM that is needed for -// transform_init + +// a way to get value_type from both accessors and USM that is needed for transform_init template -struct accessor_traits {}; - -template -struct accessor_traits< - sycl::accessor> { - using value_type = typename sycl::accessor::value_type; +struct accessor_traits +{ +}; + +template +struct accessor_traits> +{ + using value_type = typename sycl::accessor::value_type; }; template -struct accessor_traits { - using value_type = RawArrayValueType; +struct accessor_traits +{ + using value_type = RawArrayValueType; }; // calculate shift where we should start processing on current item -template -SizeN calc_shift(const NDItemId item_id, const GlobalIdx global_idx, - SizeNIter& n_iter, const SizeN n) { - auto global_range_size = item_id.get_global_range().size(); - - auto start = n_iter * global_idx; - auto global_shift = global_idx + n_iter * global_range_size; - if (n_iter > 0 && global_shift > n) { - start += n % global_range_size - global_idx; - } else if (global_shift < n) { - n_iter++; - } - return start; -} - -template -struct transform_init { - Operation1 binary_op; - Operation2 unary_op; - - template - void operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, - AccLocal& local_mem, const Acc&... acc) { - auto local_idx = item_id.get_local_id(0); +template +SizeN +calc_shift(const NDItemId item_id, const GlobalIdx global_idx, SizeNIter& n_iter, const SizeN n) +{ auto global_range_size = item_id.get_global_range().size(); - auto n_iter = n / global_range_size; - auto start = calc_shift(item_id, global_idx, n_iter, n); - auto shifted_global_idx = global_idx + start; - typename accessor_traits::value_type res; - if (global_idx < n) { - res = unary_op(shifted_global_idx, acc...); + auto start = n_iter * global_idx; + auto global_shift = global_idx + n_iter * global_range_size; + if (n_iter > 0 && global_shift > n) + { + start += n % global_range_size - global_idx; } - // Add neighbour to the current local_mem - for (decltype(n_iter) i = 1; i < n_iter; ++i) { - res = binary_op(res, unary_op(shifted_global_idx + i, acc...)); + else if (global_shift < n) + { + n_iter++; } - if (global_idx < n) { - local_mem[local_idx] = res; + return start; +} + + +template +struct transform_init +{ + Operation1 binary_op; + Operation2 unary_op; + + template + void + operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, AccLocal& local_mem, + const Acc&... 
acc) + { + auto local_idx = item_id.get_local_id(0); + auto global_range_size = item_id.get_global_range().size(); + auto n_iter = n / global_range_size; + auto start = calc_shift(item_id, global_idx, n_iter, n); + auto shifted_global_idx = global_idx + start; + + typename accessor_traits::value_type res; + if (global_idx < n) + { + res = unary_op(shifted_global_idx, acc...); + } + // Add neighbour to the current local_mem + for (decltype(n_iter) i = 1; i < n_iter; ++i) + { + res = binary_op(res, unary_op(shifted_global_idx + i, acc...)); + } + if (global_idx < n) + { + local_mem[local_idx] = res; + } } - } }; + // Reduce on local memory template -struct reduce { - BinaryOperation1 bin_op1; - - template - Tp operator()(const NDItemId item_id, const GlobalIdx global_idx, - const Size n, AccLocal& local_mem) { - auto local_idx = item_id.get_local_id(0); - auto group_size = item_id.get_local_range().size(); - - auto k = 1; - do { - item_id.barrier(sycl::access::fence_space::local_space); - if (local_idx % (2 * k) == 0 && local_idx + k < group_size && - global_idx < n && global_idx + k < n) { - local_mem[local_idx] = - bin_op1(local_mem[local_idx], local_mem[local_idx + k]); - } - k *= 2; - } while (k < group_size); - return local_mem[local_idx]; - } +struct reduce +{ + BinaryOperation1 bin_op1; + + template + Tp + operator()(const NDItemId item_id, const GlobalIdx global_idx, const Size n, AccLocal& local_mem) + { + auto local_idx = item_id.get_local_id(0); + auto group_size = item_id.get_local_range().size(); + + auto k = 1; + do + { + item_id.barrier(sycl::access::fence_space::local_space); + if (local_idx % (2 * k) == 0 && local_idx + k < group_size && global_idx < n && + global_idx + k < n) + { + local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]); + } + k *= 2; + } while (k < group_size); + return local_mem[local_idx]; + } }; + // walk through the data template -struct walk_n { - F f; - - template - auto operator()(const ItemId idx, Ranges&&... rngs) - -> decltype(f(rngs[idx]...)) { - return f(rngs[idx]...); - } +struct walk_n +{ + F f; + + template + auto + operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...)) + { + return f(rngs[idx]...); + } }; + // This option uses a parallel for to fill the buffer and then // uses a tranform_init with plus/no_op and then // a local reduction then global reduction. @@ -304,12 +322,12 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { // In this example we have done the calculation and filled the buffer above // The way transform_init works is that you need to have the value already // populated in the buffer. - auto tf_init = transform_init, Functor>{ - std::plus(), Functor{my_no_op()}}; + auto tf_init = transform_init, + Functor>{std::plus(), Functor{my_no_op()}}; auto combine = std::plus(); - auto brick_reduce = - reduce, float>{std::plus()}; + auto brick_reduce = reduce, float>{ + std::plus()}; auto workgroup_size = policy.queue() .get_device() @@ -338,8 +356,8 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { [=](nd_item<1> item_id) mutable { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). - tf_init(item_id, global_idx, num_steps, temp_buf_local, - access_buf); + tf_init(item_id, global_idx, num_steps, + temp_buf_local, access_buf); // 2. 
Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); @@ -402,12 +420,13 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { // The buffer has 1...num it at and now we will use that as an input // to the slice structue which will calculate the area of each // rectangle. - auto tf_init = transform_init, Functor2>{ - std::plus(), Functor2{slice_area(num_steps)}}; + auto tf_init = transform_init, + Functor2>{ + std::plus(), Functor2{slice_area(num_steps)}}; auto combine = std::plus(); - auto brick_reduce = - reduce, float>{std::plus()}; + auto brick_reduce = reduce, float>{ + std::plus()}; // get workgroup_size from the device auto workgroup_size = @@ -445,8 +464,8 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { auto global_idx = item_id.get_global_id(0); // 1. Initialization (transform part). Fill local // memory - tf_init(item_id, global_idx, num_steps, temp_buf_local, - access_buf); + tf_init(item_id, global_idx, num_steps, + temp_buf_local, access_buf); // 2. Reduce within work group float local_result = brick_reduce( item_id, global_idx, num_steps, temp_buf_local); From c56d424edfd2e2d31aa05f14661e25343c059a1c Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Thu, 1 Oct 2020 19:54:29 -0700 Subject: [PATCH 22/23] Update 1d_HeatTransfer code sample according to the new guideline. --- .../1d_HeatTransfer/src/1d_HeatTransfer.cpp | 75 +++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp index 83f3405ba3..fcb2b1ed47 100644 --- a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp @@ -34,12 +34,13 @@ // //****************************************************************************** #include -#include #include #include #include +#include "dpc_common.hpp" using namespace sycl; +using namespace std; constexpr float dt = 0.002f; constexpr float dx = 0.01f; @@ -49,12 +50,12 @@ constexpr float temp = 100.0f; // Initial temperature. //************************************ // Function description: display input parameters used for this sample. //************************************ -void Usage(std::string programName) { - std::cout << " Incorrect parameters \n"; - std::cout << " Usage: "; - std::cout << programName << " \n\n"; - std::cout << " n : Number of points to simulate \n"; - std::cout << " i : Number of timesteps \n"; +void Usage(string programName) { + cout << " Incorrect parameters \n"; + cout << " Usage: "; + cout << programName << " \n\n"; + cout << " n : Number of points to simulate \n"; + cout << " i : Number of timesteps \n"; } //************************************ @@ -75,43 +76,40 @@ float* ComputeHeatDeviceParallel(float* arr, float* arr_next, float C, try { // Define the device queue queue q = default_selector{}; - std::cout << "Kernel runs on " - << q.get_device().get_info() << "\n"; + cout << "Kernel runs on " << q.get_device().get_info() + << "\n"; // Set boundary condition at one end. 
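+  // Scheme note: the two kernels below advance the 1D heat equation with the
+  // explicit (forward-Euler) stencil
+  //   T_next[k] = T[k] + C * (T[k+1] - 2*T[k] + T[k-1]),
+  // while element 0 stays pinned at the initial temperature "temp" (set on
+  // the next line) and element num_p+1 copies its left neighbour each step.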
arr[0] = arr_next[0] = temp; float* current_data_ptr = arr; float* next_data_ptr = arr_next; - // current_data_ptr = arr; - // next_data_ptr = arr_next; // Buffer scope { - buffer arr_buf(current_data_ptr, range<1>{num_p + 2}); - buffer arr_next_buf(next_data_ptr, range<1>{num_p + 2}); + buffer temperature_buf(current_data_ptr, range(num_p + 2)); + buffer temperature_next_buf(next_data_ptr, range(num_p + 2)); // Iterate over timesteps for (i = 1; i <= num_iter; i++) { if (i % 2 != 0) { - q.submit([&](handler& h) { + q.submit([&](auto& h) { // The size of memory amount that will be given to the buffer. range<1> num_items{num_p + 2}; - auto arr_acc = arr_buf.get_access(h); - auto arr_next_acc = - arr_next_buf.get_access(h); + accessor temperature(temperature_buf, h); + accessor temperature_next(temperature_next_buf, h); h.parallel_for(num_items, [=](id<1> k) { size_t gid = k.get(0); if (gid == 0) { } else if (gid == num_p + 1) { - arr_next_acc[k] = arr_acc[k - 1]; + temperature_next[k] = temperature[k - 1]; } else { - arr_next_acc[k] = - C * (arr_acc[k + 1] - 2 * arr_acc[k] + arr_acc[k - 1]) + - arr_acc[k]; + temperature_next[k] = + C * (temperature[k + 1] - 2 * temperature[k] + temperature[k - 1]) + + temperature[k]; } }); // end parallel for loop in kernel1 }); // end device queue @@ -121,20 +119,19 @@ float* ComputeHeatDeviceParallel(float* arr, float* arr_next, float C, // The size of memory amount that will be given to the buffer. range<1> num_items{num_p + 2}; - auto arr_acc = arr_buf.get_access(h); - auto arr_next_acc = - arr_next_buf.get_access(h); + accessor temperature(temperature_buf, h); + accessor temperature_next(temperature_next_buf, h); h.parallel_for(num_items, [=](id<1> k) { size_t gid = k.get(0); if (gid == 0) { } else if (gid == num_p + 1) { - arr_acc[k] = arr_next_acc[k - 1]; + temperature[k] = temperature_next[k - 1]; } else { - arr_acc[k] = C * (arr_next_acc[k + 1] - 2 * arr_next_acc[k] + - arr_next_acc[k - 1]) + - arr_next_acc[k]; + temperature[k] = C * (temperature_next[k + 1] - 2 * temperature_next[k] + + temperature_next[k - 1]) + + temperature_next[k]; } }); // end parallel for loop in kernel2 }); // end device queue @@ -144,8 +141,8 @@ float* ComputeHeatDeviceParallel(float* arr, float* arr_next, float C, q.wait_and_throw(); - } catch (cl::sycl::exception e) { - std::cout << "SYCL exception caught: " << e.what() << "\n"; + } catch (sycl::exception e) { + cout << "SYCL exception caught: " << e.what() << "\n"; } if (i % 2 != 0) @@ -197,7 +194,7 @@ bool CompareResults(float* device_results, float* host_results, double norm2 = 0; bool err = false; - std::ofstream err_file; + ofstream err_file; err_file.open("error_diff.txt"); err_file << " \t idx\theat[i]\t\theat_CPU[i] \n"; @@ -225,16 +222,16 @@ int main(int argc, char* argv[]) { // Read input parameters try { - n_point = std::stoi(argv[1]); - n_iteration = std::stoi(argv[2]); + n_point = stoi(argv[1]); + n_iteration = stoi(argv[2]); } catch (...) 
{ Usage(argv[0]); return (-1); } - std::cout << "Number of points: " << n_point << "\n"; - std::cout << "Number of iterations: " << n_iteration << "\n"; + cout << "Number of points: " << n_point << "\n"; + cout << "Number of iterations: " << n_iteration << "\n"; // Array heat and heat_next arrays store temperatures of the current and next // iteration of n_point (calculated in kernel) @@ -260,7 +257,7 @@ int main(int argc, char* argv[]) { ComputeHeatDeviceParallel(heat, heat_next, C, n_point, n_iteration, temp); // Display time used by device - std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + cout << "Elapsed time: " << t_par.Elapsed() << " sec\n"; // Compute heat in CPU in (for comparision) float* final_CPU = NULL; @@ -273,10 +270,10 @@ int main(int argc, char* argv[]) { bool err = CompareResults(final_device, final_CPU, n_point, C); if (err == true) - std::cout << "Please check the error_diff.txt file ...\n"; + cout << "Please check the error_diff.txt file ...\n"; else - std::cout << "PASSED! There is no difference between the results computed " - "in host and in kernel.\n"; + cout << "PASSED! There is no difference between the results computed " + "in host and in kernel.\n"; // Cleanup delete[] heat; From 70e159fc2b0d4b402a91f07f1f4f5f53e68dca37 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 2 Oct 2020 11:45:54 -0700 Subject: [PATCH 23/23] Add comment about dpc_common.hpp . Signed-off-by: Loc Nguyen --- .../StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp index fcb2b1ed47..7b9827753b 100644 --- a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp @@ -37,6 +37,8 @@ #include #include #include +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" using namespace sycl;