From a015f1e5762ba3618d648616de192a5b05d9db01 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Mon, 13 Jul 2020 15:03:48 -0700 Subject: [PATCH 1/9] Add bitonic-sort sample. --- .../bitonic-sort/CMakeLists.txt | 30 +++ .../GraphTraversal/bitonic-sort/License.txt | 7 + .../GraphTraversal/bitonic-sort/README.md | 120 +++++++++ .../bitonic-sort/bitonic-sort.sln | 25 ++ .../bitonic-sort/bitonic-sort.vcxproj | 137 ++++++++++ .../bitonic-sort/bitonic-sort.vcxproj.filters | 22 ++ .../bitonic-sort/bitonic-sort.vcxproj.user | 11 + .../GraphTraversal/bitonic-sort/sample.json | 29 ++ .../bitonic-sort/src/bitonic-sort.cpp | 253 ++++++++++++++++++ 9 files changed, 634 insertions(+) create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json create mode 100644 DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt new file mode 100644 index 0000000000..b9c9718926 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (bitonic-sort) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (bitonic-sort src/bitonic-sort.cpp) + +add_custom_target (run + COMMAND bitonic-sort 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt new file mode 100644 index 0000000000..6e9524bd74 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt @@ -0,0 +1,7 @@ +Copyright 2020 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md new file mode 100644 index 0000000000..8066fd1f31 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -0,0 +1,120 @@ +# `Bitonic Sort` sample + +This code sample demonstrates the implementation of bitonic sort using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm sorts the sequence in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +The algorithm converts a randomized sequence of numbers into +a bitonic sequence (two ordered sequences), and then merge these two ordered +sequences into a ordered sequence. Bitonic sort algorithm is briefly +described as followed: + +- First, it decomposes the randomized sequence of size 2\*\*n into 2\*\*(n-1) +pairs where each pair consists of 2 consecutive elements. Note that each pair +is a bitonic sequence. +- Step 0: for each pair (sequence of size 2), the two elements are swapped so +that the two consecutive pairs form a bitonic sequence in increasing order, +the next two pairs form the second bitonic sequence in decreasing order, the +next two pairs form the third bitonic sequence in increasing order, etc, .... +At the end of this step, we have 2\*\*(n-1) bitonic sequences of size 2, and +they follow an order increasing, decreasing, increasing, .., decreasing. +Thus, they form 2\*\*(n-2) bitonic sequences of size 4. +- Step 1: for each new 2\*\*(n-2) bitonic sequences of size 4, (each new +sequence consists of 2 consecutive previous sequences), it swaps the elements +so that at the end of step 1, we have 2\*\*(n-2) bitonic sequences of size 4, +and they follow an order: increasing, decreasing, increasing, ..., +decreasing. Thus, they form 2\*\*(n-3) bitonic sequences of size 8. +- Same logic applies until we reach the last step. +- Step n: at this last step, we have one bitonic sequence of size 2\*\*n. The +elements in the sequence are swapped until we have a sequence in increasing +oder. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU +if a compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command g +roups. Unified Shared Memory (USM) is used for data management. + +## License +This code sample is licensed under MIT license + +## Building the `bitonic-sort` Program for CPU and GPU + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd bitonic-sort + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild bitonic-sort.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: bitonic-sort + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the computation in serial in the CPU. +The results from the parallel and serial computation are compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./bitonic-sort 21 47 +Array size: 2097152, seed: 47 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 0.416827 sec +CPU serial time: 0.60523 sec +Success! +``` \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln new file mode 100644 index 0000000000..e558305981 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.1062 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic-sort", "bitonic-sort.vcxproj", "{46454D0B-76F3-45EB-A186-F315A2E22DEA}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.ActiveCfg = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.Build.0 = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.ActiveCfg = Release|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {B1D84B81-F5D5-4459-AA6E-38B695FB908B} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj new file mode 100644 index 0000000000..9289102064 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {46454d0b-76f3-45eb-a186-f315a2e22dea} + Win32Proj + bitonic_sort + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters new file mode 100644 index 0000000000..82a4ddcfc9 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user new file mode 100644 index 0000000000..582de7464a --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json new file mode 100644 index 0000000000..c382d764e1 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "4D5B57B8-6F34-4A11-89F5-3F07E766DB39", + "name": "bitonic-sort", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Bitonic Sort using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild bitonic-sort.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "bitonic-sort.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp new file mode 100644 index 0000000000..e0e4312520 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -0,0 +1,253 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Bitonic Sort: this algorithm converts a randomized sequence of numbers into +// a bitonic sequence (two ordered sequences), and then merge these two ordered +// sequences into a ordered sequence. Bitonic sort algorithm is briefly +// described as followed: +// +// - First, it decomposes the randomized sequence of size 2**n into 2**(n-1) +// pairs where each pair consists of 2 consecutive elements. Note that each pair +// is a bitonic sequence. +// - Step 0: for each pair (sequence of size 2), the two elements are swapped so +// that the two consecutive pairs form a bitonic sequence in increasing order, +// the next two pairs form the second bitonic sequence in decreasing order, the +// next two pairs form the third bitonic sequence in increasing order, etc, ... +// . At the end of this step, we have 2**(n-1) bitonic sequences of size 2, and +// they follow an order increasing, decreasing, increasing, .., decreasing. +// Thus, they form 2**(n-2) bitonic sequences of size 4. +// - Step 1: for each new 2**(n-2) bitonic sequences of size 4, (each new +// sequence consists of 2 consecutive previous sequences), it swaps the elements +// so that at the end of step 1, we have 2**(n-2) bitonic sequences of size 4, +// and they follow an order: increasing, decreasing, increasing, ..., +// decreasing. Thus, they form 2**(n-3) bitonic sequences of size 8. +// - Same logic applies until we reach the last step. +// - Step n: at this last step, we have one bitonic sequence of size 2**n. The +// elements in the sequence are swapped until we have a sequence in increasing +// oder. +// +// In this implementation, a randomized sequence of size 2**n is given (n is a +// positive number). Unified Shared Memory (USM) is used for data management. At +// each stage, a part of step, the host redefines the ordered sequenes and sends +// data to the kernel. The kernel swaps the elements accordingly in parallel. +// +#include +#include +#include + +using namespace sycl; +using namespace std; + +void ParallelBitonicSort(int a[], int n, queue &q) { + // n: the exponent used to set the array size. Array size = power(2, n) + int size = pow(2, n); + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1, ..., 0 + for (int stage = step; stage >= 0; stage--) { + // In each state, construct a number (num_seq) of bitonic sequences of + // size seq_len (2, 4, ...) num_seq stores the number of bitonic sequences + // at each stage. seq_len stores the length of the bitonic sequence at + // each stage. + int seq_len = pow(2, stage + 1); +#if DEBUG + int num_seq = pow(2, (n - stage - 1)); // Used for debug purpose. + std::cout << "step num:" << step << " stage num:" << stage + << " num_seq:" << num_seq << "(" << seq_len << ") => "; +#endif + // Constant used in the kernel: 2**(step-stage). + int two_power = 1 << (step - stage); + + // Offload the work to kernel. + q.submit([&](handler &h) { + h.parallel_for(range<1>(size), [=](id<1> i) { + // Assign the bitonic sequence number. + int seq_num = i / seq_len; + + // Variable used to identified the swapped element. + int swapped_ele = -1; + + // Because the elements in the first half in the bitonic + // sequence may swap with elements in the second half, + // only the first half of elements in each sequence is + // required (seq_len/2). + int h_len = seq_len / 2; + + if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len; + + // Check whether increasing or decreasing order. + int odd = seq_num / two_power; + + // Boolean variable used to determine "increasing" or + // "decreasing" order. + bool increasing = ((odd % 2) == 0); + + // Swap the elements in the bitonic sequence if needed + if (swapped_ele != -1) { + if (((a[i] > a[swapped_ele]) && increasing) || + ((a[i] < a[swapped_ele]) && !increasing)) { + int temp = a[i]; + a[i] = a[swapped_ele]; + a[swapped_ele] = temp; + } + } + }); + }); + q.wait(); + } // end stage + } // end step +} + +// Loop over the bitonic sequences at each stage in serial. +void SwapElements(int step, int stage, int num_sequence, int seq_len, + int *array) { + for (int seq_num = 0; seq_num < num_sequence; seq_num++) { + int odd = seq_num / (pow(2, (step - stage))); + bool increasing = ((odd % 2) == 0); + + int h_len = seq_len / 2; + + // For all elements in a bitonic sequence, swap them if needed + for (int i = seq_num * seq_len; i < seq_num * seq_len + h_len; i++) { + int swapped_ele = i + h_len; + + if (((array[i] > array[swapped_ele]) && increasing) || + ((array[i] < array[swapped_ele]) && !increasing)) { + int temp = array[i]; + array[i] = array[swapped_ele]; + array[swapped_ele] = temp; + } + } // end for all elements in a sequence + } // end all sequences +} + +// Function sorts an array in serial using bitonic sort algorithm. The size of +// the array is indicated by the exponent n: the array size is 2 ** n. +inline void BitonicSort(int a[], int n) { + // n: the exponent indicating the array size = 2 ** n. + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1,..., 0 + for (int stage = step; stage >= 0; stage--) { + // Sequences (same size) are formed at each stage. + int num_sequence = pow(2, (n - stage - 1)); + // The length of the sequences (2, 4, ...). + int sequence_len = pow(2, stage + 1); + + SwapElements(step, stage, num_sequence, sequence_len, a); + } + } +} + +// Function showing the array. +void DisplayArray(int a[], int array_size) { + for (int i = 0; i < array_size; ++i) std::cout << a[i] << " "; + std::cout << "\n"; +} + +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. " + "The number of element in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please " + "enter the corresponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char *argv[]) { + int n, seed, size; + int exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + size = pow(2, n); + } catch (...) { + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nArray size: " << size << ", seed: " << seed << "\n"; + + // Create queue on implementation-chosen default device. + queue q; + + std::cout << "Device: " << q.get_device().get_info() + << "\n"; + + // USM allocation using malloc_shared: data stores a sequence of random + // numbers. + int *data = malloc_shared(size, q); + + // Memory allocated for host access only. + int *data2 = (int *)malloc(size * sizeof(int)); + + // Initialize the array randomly using a seed. + srand(seed); + + for (int i = 0; i < size; i++) data[i] = data2[i] = rand() % 1000; + +#if DEBUG + std::cout << "\ndata before:\n"; + DisplayArray(data, size); +#endif + + // Start timer + dpc_common::TimeInterval t_par; + + ParallelBitonicSort(data, n, q); + + std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + +#if DEBUG + std::cout << "\ndata after sorting using parallel bitonic sort:\n"; + DisplayArray(data, size); +#endif + + // Start timer + dpc_common::TimeInterval t_ser; + + // Bitonic sort in CPU (serial) + BitonicSort(data2, n); + + std::cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n"; + + // Verify both bitonic sort algorithms in kernel and in CPU. + bool pass = true; + for (int i = 0; i < size - 1; i++) { + // Validate the sequence order is increasing in both kernel and CPU. + if ((data[i] > data[i + 1]) || (data[i] != data2[i])) { + pass = false; + break; + } + } + + // Clean USM resources. + free(data, q); + + // Clean CPU memory. + free(data2); + + if (!pass) { + std::cout << "\nFailed!\n"; + return -2; + } + + std::cout << "\nSuccess!\n"; + return 0; +} From d3ab2190a761fe04d33139ebd8e041228ce28d2d Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Tue, 14 Jul 2020 11:43:03 -0700 Subject: [PATCH 2/9] Add a note about common file in README. Signed-off-by: Loc Nguyen --- DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md index 8066fd1f31..190b5874d0 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -58,6 +58,9 @@ This code sample is licensed under MIT license ## Building the `bitonic-sort` Program for CPU and GPU +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + ### Running Samples In DevCloud If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI From 8818264398babb35c72fd30fa44d2d68b6cbcbe4 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Tue, 14 Jul 2020 14:16:39 -0700 Subject: [PATCH 3/9] Move 1d_HeatTransfer sample to open source GitHub. Signed-off-by: Loc Nguyen --- .../1d_HeatTransfer/1d_HeatTransfer.sln | 25 ++ .../1d_HeatTransfer/1d_HeatTransfer.vcxproj | 149 +++++++++ .../1d_HeatTransfer.vcxproj.filters | 22 ++ .../1d_HeatTransfer.vcxproj.user | 11 + .../1d_HeatTransfer/CMakeLists.txt | 27 ++ .../1d_HeatTransfer/License.txt | 7 + .../StructuredGrids/1d_HeatTransfer/README.md | 120 ++++++++ .../1d_HeatTransfer/sample.json | 29 ++ .../1d_HeatTransfer/src/1d_HeatTransfer.cpp | 288 ++++++++++++++++++ 9 files changed, 678 insertions(+) create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json create mode 100644 DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln new file mode 100644 index 0000000000..b127a6bb7d --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.1062 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "1d_HeatTransfer", "1d_HeatTransfer.vcxproj", "{D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Debug|x64.ActiveCfg = Debug|x64 + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Debug|x64.Build.0 = Debug|x64 + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Release|x64.ActiveCfg = Release|x64 + {D3AB428E-A631-4BA1-A526-9A05B7B8E8CE}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {81F9E738-2753-4252-82FA-AE9A402B7ABD} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj new file mode 100644 index 0000000000..474bc4dd1e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj @@ -0,0 +1,149 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {d3ab428e-a631-4ba1-a526-9a05b7b8e8ce} + Win32Proj + _1d_HeatTransfer + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters new file mode 100644 index 0000000000..8efae7e89b --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user new file mode 100644 index 0000000000..1bfa967dac --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/1d_HeatTransfer.vcxproj.user @@ -0,0 +1,11 @@ + + + + 100 1000 + WindowsLocalDebugger + + + 100 1000 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt new file mode 100644 index 0000000000..5efeb945b3 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/CMakeLists.txt @@ -0,0 +1,27 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +# CMakeLists.txt for 1d_HeatTransfer project +project (1d_HeatTransfer) + +set(CMAKE_CXX_COMPILER "dpcpp") + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (1d_HeatTransfer src/1d_HeatTransfer.cpp) + +add_custom_target (run + COMMAND 1d_HeatTransfer 100 1000 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt new file mode 100644 index 0000000000..0578223382 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/License.txt @@ -0,0 +1,7 @@ +Copyright 2019 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md new file mode 100644 index 0000000000..6459b25e05 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md @@ -0,0 +1,120 @@ +# 1D-Heat-Transfer Sample + +This code sample demonstrates the simulation of one dimensional heat transfer process using +Intel Data Parallel C++. Kernels in this example are implemented as a discretized differential +equation with second derivative in space and first derivative in time. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | How to simulate 1D Heat Transfer using Intel DPC++ compiler +| Time to complete | 10 minutes + + +## Purpose + +1D-Heat-Transfer is a DPC++ application that simulates the heat propagation on a one-dimensional +isotropic and homogeneous medium. The following equation is used in the simulation of heat propagation: + +dU/dt = k * d2U/dx2 + +Where: +dU/dt is the rate of change of temperature at a point. +k is the thermal difusivity. +d2U/dx2 is the second spatial derivative. + +Or + +U(i) = C * (U(i+1) - 2 * U(i) + U(i-1)) + U(i) + +where constant C = k * dt / (dx * dx) + +The code sample includes both parallel and serial calculation of heat propagation. The code sample will +attempt first to execute on an available GPU and fallback to the system's CPU if a compatible GPU is +not detected. The results are stored in a file. + + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command groups. + + +## License + +This code sample is licensed under MIT license + + +## Building the `1d_HeatTransfer` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system". + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System + 1. Build the program using the following `cmake` commands. + ``` + $ cd 1d_HeatTransfer + $ mkdir build + $ cd build + $ cmake .. + $ make -j + ``` + + 2. Run the program + + ``` + make run + ``` + + 3. Clean the program + + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild 1d_HeatTransfer.sln /t:Rebuild /p:Configuration="Release" + + +## Running the sample +### Application Parameters + + Usage: 1d_HeatTransfer + +where + +n is the number of points you want to simulate the heat transfer. + +i is the number of timesteps in the simulation. + +The sample offloads the computation to GPU and also performs a computation in serial in the CPU. +The results from the parallel and serial computation are compared and stored in an output file named "error_diff.txt". +If the results match, the application will display a “PASSED” message. + +### Example of Output +``` +$ ./1d_HeatTransfer 100 1000 +Number of points: 100 +Number of iterations: 1000 +Kernel runs on Intel(R) Gen9 HD Graphics NEO +Kernel time: 0.347854 sec +PASSED! There is no difference between the results computed in host and in kernel. +``` diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json new file mode 100644 index 0000000000..4346f22435 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "CDF82E46-91D4-46F6-B1E2-5C21443F41BC", + "name": "1d_HeatTransfer", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "1D_HEAT_TRANSFER: Simulating 1D Heat Transfer problem Using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild 1d_HeatTransfer.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "1d_HeatTransfer.exe 100 1000" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp new file mode 100644 index 0000000000..83f3405ba3 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/src/1d_HeatTransfer.cpp @@ -0,0 +1,288 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// 1D HEAT TRANSFER: Using Intel® oneAPI DPC++ Language to simulate 1D Heat +// Transfer. +// +// The code sample simulates the heat propagation according to the following +// equation (case where there is no heat generation): +// +// dU/dt = k * d2U/dx2 +// (u(x,t+DT) - u(x,t)) / DT = k * (u(x+DX,t)- 2u(x,t) + u(x-DX,t)) / DX2 +// U(i) = C * (U(i+1) - 2 * U(i) + U(i-1)) + U(i) +// +// where constant C = k * dt / (dx * dx) +// +// For comprehensive instructions regarding DPC++ Programming, go to +// https://software.intel.com/en-us/oneapi-programming-guide +// and search based on relevant terms noted in the comments. +// +// DPC++ material used in this code sample: +// +// Basic structures of DPC++: +// DPC++ Queues (including device selectors and exception handlers) +// DPC++ Buffers and accessors (communicate data between the host and the +// device) +// DPC++ Kernels (including parallel_for function and range<1> objects) +// +//****************************************************************************** +// Content: (version 1.1) +// 1d_HeatTransfer.cpp +// +//****************************************************************************** +#include +#include +#include +#include +#include + +using namespace sycl; + +constexpr float dt = 0.002f; +constexpr float dx = 0.01f; +constexpr float k = 0.025f; +constexpr float temp = 100.0f; // Initial temperature. + +//************************************ +// Function description: display input parameters used for this sample. +//************************************ +void Usage(std::string programName) { + std::cout << " Incorrect parameters \n"; + std::cout << " Usage: "; + std::cout << programName << " \n\n"; + std::cout << " n : Number of points to simulate \n"; + std::cout << " i : Number of timesteps \n"; +} + +//************************************ +// Function description: initialize the array. +//************************************ +void Initialize(float* array, unsigned int num, unsigned int idx) { + for (unsigned int i = idx; i < num; i++) array[i] = 0.0f; +} + +//************************************ +// Function description: compute the heat in the device (in parallel). +//************************************ +float* ComputeHeatDeviceParallel(float* arr, float* arr_next, float C, + unsigned int num_p, unsigned int num_iter, + float temp) { + unsigned int i; + + try { + // Define the device queue + queue q = default_selector{}; + std::cout << "Kernel runs on " + << q.get_device().get_info() << "\n"; + + // Set boundary condition at one end. + arr[0] = arr_next[0] = temp; + + float* current_data_ptr = arr; + float* next_data_ptr = arr_next; + // current_data_ptr = arr; + // next_data_ptr = arr_next; + + // Buffer scope + { + buffer arr_buf(current_data_ptr, range<1>{num_p + 2}); + buffer arr_next_buf(next_data_ptr, range<1>{num_p + 2}); + + // Iterate over timesteps + for (i = 1; i <= num_iter; i++) { + if (i % 2 != 0) { + q.submit([&](handler& h) { + // The size of memory amount that will be given to the buffer. + range<1> num_items{num_p + 2}; + + auto arr_acc = arr_buf.get_access(h); + auto arr_next_acc = + arr_next_buf.get_access(h); + + h.parallel_for(num_items, [=](id<1> k) { + size_t gid = k.get(0); + + if (gid == 0) { + } else if (gid == num_p + 1) { + arr_next_acc[k] = arr_acc[k - 1]; + } else { + arr_next_acc[k] = + C * (arr_acc[k + 1] - 2 * arr_acc[k] + arr_acc[k - 1]) + + arr_acc[k]; + } + }); // end parallel for loop in kernel1 + }); // end device queue + + } else { + q.submit([&](handler& h) { + // The size of memory amount that will be given to the buffer. + range<1> num_items{num_p + 2}; + + auto arr_acc = arr_buf.get_access(h); + auto arr_next_acc = + arr_next_buf.get_access(h); + + h.parallel_for(num_items, [=](id<1> k) { + size_t gid = k.get(0); + + if (gid == 0) { + } else if (gid == num_p + 1) { + arr_acc[k] = arr_next_acc[k - 1]; + } else { + arr_acc[k] = C * (arr_next_acc[k + 1] - 2 * arr_next_acc[k] + + arr_next_acc[k - 1]) + + arr_next_acc[k]; + } + }); // end parallel for loop in kernel2 + }); // end device queue + } // end if %2 + } // end iteration + } // end buffer scope + + q.wait_and_throw(); + + } catch (cl::sycl::exception e) { + std::cout << "SYCL exception caught: " << e.what() << "\n"; + } + + if (i % 2 != 0) + return arr; + else + return arr_next; +} + +//************************************ +// Function description: compute the heat in the host (in serial). +//************************************ +float* ComputeHeatHostSerial(float* arr, float* arr_next, float C, + unsigned int num_p, unsigned int num_iter, + float temp) { + unsigned int i, k; + float* swap; + + // Set initial condition + Initialize(arr, num_p + 2, 0); + Initialize(arr_next, num_p + 2, 0); + + // Set boundary condition at one end. + arr[0] = arr_next[0] = temp; + + // Iterate over timesteps + for (i = 1; i <= num_iter; i++) { + for (k = 1; k <= num_p; k++) + arr_next[k] = C * (arr[k + 1] - 2 * arr[k] + arr[k - 1]) + arr[k]; + + arr_next[num_p + 1] = arr[num_p]; + + // Swap the buffers at every iteration. + swap = arr; + arr = arr_next; + arr_next = swap; + } + + return arr; +} + +//************************************ +// Function description: calculate the results computed by the host and by the +// device. +//************************************ +bool CompareResults(float* device_results, float* host_results, + unsigned int num_point, float C) { + float delta = 0.001f; + float difference = 0.00f; + double norm2 = 0; + bool err = false; + + std::ofstream err_file; + err_file.open("error_diff.txt"); + + err_file << " \t idx\theat[i]\t\theat_CPU[i] \n"; + + for (unsigned int i = 0; i < num_point + 2; i++) { + err_file << "\n RESULT: " << i << "\t" << std::setw(12) << std::left + << device_results[i] << "\t" << host_results[i]; + + difference = fabsf(host_results[i] - device_results[i]); + norm2 += difference * difference; + + if (difference > delta) { + err = true; + err_file << ", diff: " << difference; + } + } + + return err; +} + +int main(int argc, char* argv[]) { + unsigned int n_point; // The number of point in 1D space + unsigned int + n_iteration; // The number of iteration to simulate the heat propagation + + // Read input parameters + try { + n_point = std::stoi(argv[1]); + n_iteration = std::stoi(argv[2]); + + } catch (...) { + Usage(argv[0]); + return (-1); + } + + std::cout << "Number of points: " << n_point << "\n"; + std::cout << "Number of iterations: " << n_iteration << "\n"; + + // Array heat and heat_next arrays store temperatures of the current and next + // iteration of n_point (calculated in kernel) + float* heat = new float[n_point + 2]; + float* heat_next = new float[n_point + 2]; + + // heat_CPU and heat_next_CPU store temperatures of the current and next + // iteration of n_point (calculated in CPU or comparison) + float* heat_CPU = new float[n_point + 2]; + float* heat_CPU_next = new float[n_point + 2]; + + // Constant used in the simulation + float C = (k * dt) / (dx * dx); + + // Heat initial condition at t = 0 + Initialize(heat, n_point + 2, 0); + Initialize(heat_next, n_point + 2, 0); + + // Start timer + dpc_common::TimeInterval t_par; + + float* final_device = + ComputeHeatDeviceParallel(heat, heat_next, C, n_point, n_iteration, temp); + + // Display time used by device + std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + + // Compute heat in CPU in (for comparision) + float* final_CPU = NULL; + + final_CPU = ComputeHeatHostSerial(heat_CPU, heat_CPU_next, C, n_point, + n_iteration, temp); + + // Compare the results computed in device (in parallel) and in host (in + // serial) + bool err = CompareResults(final_device, final_CPU, n_point, C); + + if (err == true) + std::cout << "Please check the error_diff.txt file ...\n"; + else + std::cout << "PASSED! There is no difference between the results computed " + "in host and in kernel.\n"; + + // Cleanup + delete[] heat; + delete[] heat_next; + delete[] heat_CPU; + delete[] heat_CPU_next; + + return 0; +} From 306e10bd00ab743945fd76c6dd0e6b4e5c54f815 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 10:15:31 -0700 Subject: [PATCH 4/9] Updating License file to remove date --- DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt index 6e9524bd74..415025cf03 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt @@ -1,4 +1,4 @@ -Copyright 2020 Intel Corporation +Copyright Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: From d8b1c57c7cb33a22a669a2a9a01d6f9853eb93fb Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 10:27:06 -0700 Subject: [PATCH 5/9] Adding Buffer Object approach. --- .../GraphTraversal/bitonic-sort/README.md | 9 +- .../bitonic-sort/src/bitonic-sort.cpp | 138 ++++++++++++++---- 2 files changed, 114 insertions(+), 33 deletions(-) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md index 061f753ed0..0777dc2c0b 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -51,7 +51,7 @@ if a compatible GPU is not detected. ## Key Implementation Details The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command g -roups. Unified Shared Memory (USM) is used for data management. +roups. Unified Shared Memory (USM) and Buffer Object are used for data management. ## License This code sample is licensed under MIT license @@ -117,7 +117,10 @@ the ascending order is verified, the application will display a “Success!” m $ ./bitonic-sort 21 47 Array size: 2097152, seed: 47 Device: Intel(R) Gen9 HD Graphics NEO -Kernel time: 0.416827 sec -CPU serial time: 0.60523 sec +Warm up ... +Kernel time using USM: 0.248422 sec +Kernel time using buffer allocation: 0.253364 sec +CPU serial time: 0.628803 sec + Success! ``` diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp index e0e4312520..07dad3e0f0 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -35,38 +35,90 @@ // data to the kernel. The kernel swaps the elements accordingly in parallel. // #include -#include #include +#include "dpc_common.hpp" using namespace sycl; using namespace std; -void ParallelBitonicSort(int a[], int n, queue &q) { +#define DEBUG 0 + +void ParallelBitonicSort(int data_gpu[], int n, queue &q) { + // n: the exponent used to set the array size. Array size = power(2, n) + int size = pow(2, n); + int* a = data_gpu; + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1, ..., 0 + for (int stage = step; stage >= 0; stage--) { + int seq_len = pow(2, stage + 1); + + // Constant used in the kernel: 2**(step-stage). + int two_power = 1 << (step - stage); + + // Offload the work to kernel. + q.submit([&](handler &h) { + h.parallel_for(range<1>(size), [=](id<1> i) { + // Assign the bitonic sequence number. + int seq_num = i / seq_len; + + // Variable used to identified the swapped element. + int swapped_ele = -1; + + // Because the elements in the first half in the bitonic + // sequence may swap with elements in the second half, + // only the first half of elements in each sequence is + // required (seq_len/2). + int h_len = seq_len / 2; + + if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len; + + // Check whether increasing or decreasing order. + int odd = seq_num / two_power; + + // Boolean variable used to determine "increasing" or + // "decreasing" order. + bool increasing = ((odd % 2) == 0); + + // Swap the elements in the bitonic sequence if needed + if (swapped_ele != -1) { + if (((a[i] > a[swapped_ele]) && increasing) || + ((a[i] < a[swapped_ele]) && !increasing)) { + int temp = a[i]; + a[i] = a[swapped_ele]; + a[swapped_ele] = temp; + } + } + }); + }); + q.wait(); + } // end stage + } // end step +} + +void ParallelBitonicSortBuffer(int data_gpu[], int n, queue &q) { // n: the exponent used to set the array size. Array size = power(2, n) int size = pow(2, n); + buffer input (data_gpu, size); + // step from 0, 1, 2, ...., n-1 for (int step = 0; step < n; step++) { // for each step s, stage goes s, s-1, ..., 0 for (int stage = step; stage >= 0; stage--) { - // In each state, construct a number (num_seq) of bitonic sequences of - // size seq_len (2, 4, ...) num_seq stores the number of bitonic sequences - // at each stage. seq_len stores the length of the bitonic sequence at - // each stage. int seq_len = pow(2, stage + 1); -#if DEBUG - int num_seq = pow(2, (n - stage - 1)); // Used for debug purpose. - std::cout << "step num:" << step << " stage num:" << stage - << " num_seq:" << num_seq << "(" << seq_len << ") => "; -#endif + // Constant used in the kernel: 2**(step-stage). int two_power = 1 << (step - stage); // Offload the work to kernel. q.submit([&](handler &h) { - h.parallel_for(range<1>(size), [=](id<1> i) { + auto a = input.get_access(h); + + h.parallel_for(range<1>(size), [=](id<1> i) { // Assign the bitonic sequence number. - int seq_num = i / seq_len; + int seq_num = i / seq_len; // Variable used to identified the swapped element. int swapped_ele = -1; @@ -190,40 +242,62 @@ int main(int argc, char *argv[]) { std::cout << "Device: " << q.get_device().get_info() << "\n"; + // Memory allocated for host access only. + int *data_cpu = (int *)malloc(size * sizeof(int)); + // USM allocation using malloc_shared: data stores a sequence of random // numbers. - int *data = malloc_shared(size, q); + int *data_usm = malloc_shared(size, q); - // Memory allocated for host access only. - int *data2 = (int *)malloc(size * sizeof(int)); + // Memory allocated to store gpu results using buffer allocation + int *data_gpu = (int *)malloc(size * sizeof(int)); // Initialize the array randomly using a seed. srand(seed); - for (int i = 0; i < size; i++) data[i] = data2[i] = rand() % 1000; + for (int i = 0; i < size; i++) + data_usm[i] = data_gpu[i] = data_cpu[i] = rand() % 1000; #if DEBUG std::cout << "\ndata before:\n"; - DisplayArray(data, size); + DisplayArray(data_usm, size); #endif + // Warm up + std::cout << "Warm up ...\n"; + ParallelBitonicSort(data_usm, n, q); + // Start timer dpc_common::TimeInterval t_par; - ParallelBitonicSort(data, n, q); + // Parallel sort using USM + ParallelBitonicSort(data_usm, n, q); - std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + std::cout << "Kernel time using USM: " << t_par.Elapsed() << " sec\n"; #if DEBUG - std::cout << "\ndata after sorting using parallel bitonic sort:\n"; - DisplayArray(data, size); + std::cout << "\ndata_usm after sorting using parallel bitonic sort:\n"; + DisplayArray(data_usm, size); #endif + // Start timer + dpc_common::TimeInterval t_par2; + + // Parallel sort using buffer allocation + ParallelBitonicSortBuffer(data_gpu, n, q); + + std::cout << "Kernel time using buffer allocation: " << t_par2.Elapsed() << " sec\n"; + +#if DEBUG + std::cout << "\ndata_gpu after sorting using parallel bitonic sort:\n"; + DisplayArray(data_gpu, size); +#endif + // Start timer dpc_common::TimeInterval t_ser; // Bitonic sort in CPU (serial) - BitonicSort(data2, n); + BitonicSort(data_cpu, n); std::cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n"; @@ -231,18 +305,22 @@ int main(int argc, char *argv[]) { bool pass = true; for (int i = 0; i < size - 1; i++) { // Validate the sequence order is increasing in both kernel and CPU. - if ((data[i] > data[i + 1]) || (data[i] != data2[i])) { + if ((data_usm[i] > data_usm[i + 1]) || (data_usm[i] != data_cpu[i])) { pass = false; break; } + + if ((data_gpu[i] > data_gpu[i + 1]) || (data_gpu[i] != data_cpu[i])) { + pass = false; + break; + } } - // Clean USM resources. - free(data, q); - - // Clean CPU memory. - free(data2); - + // Clean resources. + free(data_cpu); + free(data_usm, q); + free(data_gpu); + if (!pass) { std::cout << "\nFailed!\n"; return -2; From 74e5ec0d90cca66e0ba5228d90f6c5a63fb8cb16 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 13:34:27 -0700 Subject: [PATCH 6/9] Add comment about the location of dpc_common.hpp. --- .../DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp index 07dad3e0f0..0153bf4cd1 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -36,6 +36,9 @@ // #include #include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" using namespace sycl; From e2e8d014d73ef99684c717bdb0ee55347715aea6 Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 14:04:43 -0700 Subject: [PATCH 7/9] New sample: Prefix Sum. --- .../ParallelPatterns/PrefixSum/CMakeLists.txt | 30 +++ .../ParallelPatterns/PrefixSum/License.txt | 7 + .../ParallelPatterns/PrefixSum/PrefixSum.sln | 25 ++ .../PrefixSum/PrefixSum.vcxproj | 137 ++++++++++ .../PrefixSum/PrefixSum.vcxproj.filters | 22 ++ .../PrefixSum/PrefixSum.vcxproj.user | 11 + .../ParallelPatterns/PrefixSum/README.md | 124 +++++++++ .../ParallelPatterns/PrefixSum/sample.json | 29 +++ .../PrefixSum/src/PrefixSum.cpp | 239 ++++++++++++++++++ 9 files changed, 624 insertions(+) create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt new file mode 100644 index 0000000000..85fcec4963 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (PrefixSum) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (PrefixSum src/PrefixSum.cpp) + +add_custom_target (run + COMMAND PrefixSum 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt new file mode 100644 index 0000000000..415025cf03 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln new file mode 100644 index 0000000000..3587a92e74 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29926.136 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj new file mode 100644 index 0000000000..6a6309b96b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} + Win32Proj + PrefixSum + 10.0.17763.0 + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters new file mode 100644 index 0000000000..2003dce0f2 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user new file mode 100644 index 0000000000..7288fa06dd --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md new file mode 100644 index 0000000000..6bbc2cfdfb --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md @@ -0,0 +1,124 @@ +# `Prefix Sum` sample + +This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns +a new sequence y0, y1, y2, ..., yn so that + +y0 = x0 +y1 = x0 + x1 +y2 = x0 + x1 + x2 +..... +yn = x0 + x1 + x2 + ... + xn + +Below is the pseudo code for computing prefix sum in parallel: + +n is power of 2 (1, 2, 4 , 8, 16, ...): + +for i from 0 to [log2 n] - 1 do + for j from 0 to (n-1) do in parallel + if j<2^i then + x_{j}^{i+1} <- x_{j}^{i}} + else + x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} + +In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +Given n processors to perform each iteration of the inner loop in constant time, the algorithm +as a whole runs in O(log n) time, the number of iterations of the outer loop. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a +compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command +groups. + +## License +This code sample is licensed under MIT license + +## Building the `PrefixSum` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd PrefixSum + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: PrefixSum + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the verification the results in the CPU. +The results are verified if yk = yk-1 + xk the original compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./PrefixSum 21 47 + +Sequence size: 2097152, seed: 47 +Num iteration: 21 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 170 ms + +Success! +``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json new file mode 100644 index 0000000000..def268a2f8 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", + "name": "PrefixSum", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "PrefixSum.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp new file mode 100644 index 0000000000..b2af8367a7 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp @@ -0,0 +1,239 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That +// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and +// returns a new sequence y0, y1, y2, ..., yn so that +// +// y0 = x0 +// y1 = x0 + x1 +// y2 = x0 + x1 + x2 +// ..... +// yn = x0 + x1 + x2 + ... + xn +// +// Below is the pseudo code for computing prefix sum in parallel: +// +// n is power of 2 (1, 2, 4 , 8, 16, ...): +// +// for i from 0 to [log2 n] - 1 do +// for j from 0 to (n-1) do in parallel +// if j<2^i then +// x_{j}^{i+1} <- x_{j}^{i}} +// else +// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} +// +// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as +// a whole runs in O(log n) time, the number of iterations of the outer loop. +// + +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +void Show(int a[], int arraysize) +{ + for (int i = 0; i < arraysize; ++i) + { + std::cout << a[i] << " "; + if ((i % 16) == 15) std::cout << "\n"; + } + + std::cout << "\n"; + return; +} + +int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + //unsigned int uintmax = UINT_MAX; + int* result = NULL; + + // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; + // Buffer scope + { + buffer prefix1_buf(prefix1, range<1>{nb}); + buffer prefix2_buf(prefix2, range<1>{nb}); + + // Iterate over the necessary iterations. + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + + // Submit command group for execution + q.submit([&](handler& h) { + // Create accessors + auto prefix1_acc = prefix1_buf.get_access(h); + auto prefix2_acc = prefix2_buf.get_access(h); + + if (iter % 2 == 0) { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix2_acc[j] = prefix1_acc[j]; + } + else { + prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix2; + //std::cout << "return prefix2\n"; + } + else { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix1_acc[j] = prefix2_acc[j]; + } + else { + prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix1; + //std::cout << "return prefix1\n"; + } + }); // end device queue + } // end iteration + } // Buffer scope + + // Wait for commands to complete. Enforce synchronization on the command queue + q.wait_and_throw(); + + return result; +} +/* +void PrefixSum(int* x, unsigned int nb) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + int temp = 0; + + // Iterate over the necessary iterations + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + //Show(x, nb); + // std::cout << "two_power: " << two_power << "\n"; + for (unsigned int j = nb; j > 0; j--) { + if (j < two_power) { + x[j] = x[j]; + } + else { + x[j] = x[j] + x[j - two_power]; + } + } + } +} +*/ +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ +ement in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ +sponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char* argv[]) { + unsigned int nb, seed; + int n, exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + nb = pow(2, n); + } catch (...) { + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nSequence size: " << nb << ", seed: " << seed; + + int num_iter = log2(nb); + std::cout << "\nNum iteration: " << num_iter << "\n"; + + // Define device selector as 'default' + default_selector device_selector; + + // exception handler + auto exception_handler = [](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::terminate(); + } + } + }; + + // Create a device queue using DPC++ class queue + queue q(device_selector, exception_handler); + + std::cout << "Device: " << q.get_device().get_info() << "\n"; + + int *data = new int[nb]; + int *prefix_sum1 = new int[nb]; + int *prefix_sum2 = new int[nb]; + int *result = NULL; + + srand(seed); + + // Initialize data arrays + for (int i = 0; i < nb; i++) { + data[i] = prefix_sum1[i] = rand() % 10; + prefix_sum2[i] = 0; + } + + // Start timer + auto start = std::chrono::steady_clock::now(); + + result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); + + auto end = std::chrono::steady_clock::now(); + auto timeKern = std::chrono::duration_cast(end - start).count(); + std::cout << "Kernel time: " << timeKern << " ms" << "\n"; + + //std::cout << "\ndata after transforming using parallel prefix sum result:"; + //Show(result, nb); + + bool equal = true; + + if (result[0] != data[0]) + equal = false; + else + { + for (int i = 1; i < nb; i++) { + if (result[i] != result[i - 1] + data[i]) + { + equal = false; + break; + } + } + } + + delete[] data; + delete[] prefix_sum1; + delete[] prefix_sum2; + + if (!equal) { + std::cout << "\nFailed: " << std::endl; + return -2; + } + else { + std::cout << "\nSuccess!" << std::endl; + return 0; + } +} From 4f4a320e291d605fab1126efa3a03bcf1593374e Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Fri, 14 Aug 2020 14:16:22 -0700 Subject: [PATCH 8/9] Remove new sample. --- .../ParallelPatterns/PrefixSum/CMakeLists.txt | 30 --- .../ParallelPatterns/PrefixSum/License.txt | 7 - .../ParallelPatterns/PrefixSum/PrefixSum.sln | 25 -- .../PrefixSum/PrefixSum.vcxproj | 137 ---------- .../PrefixSum/PrefixSum.vcxproj.filters | 22 -- .../PrefixSum/PrefixSum.vcxproj.user | 11 - .../ParallelPatterns/PrefixSum/README.md | 124 --------- .../ParallelPatterns/PrefixSum/sample.json | 29 --- .../PrefixSum/src/PrefixSum.cpp | 239 ------------------ 9 files changed, 624 deletions(-) delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json delete mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt deleted file mode 100644 index 85fcec4963..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -# required cmake version -cmake_minimum_required(VERSION 3.5) - -project (PrefixSum) - -if(WIN32) - set(CMAKE_CXX_COMPILER "dpcpp") -else() - set(CMAKE_CXX_COMPILER "dpcpp") -endif() - -# Set default build type to RelWithDebInfo if not specified -if (NOT CMAKE_BUILD_TYPE) - message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") - set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE - STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" - FORCE) -endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") - -set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") - -add_executable (PrefixSum src/PrefixSum.cpp) - -add_custom_target (run - COMMAND PrefixSum 21 47 - WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} -) - diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt deleted file mode 100644 index 415025cf03..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt +++ /dev/null @@ -1,7 +0,0 @@ -Copyright Intel Corporation - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln deleted file mode 100644 index 3587a92e74..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 16 -VisualStudioVersion = 16.0.29926.136 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|x64 = Debug|x64 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 - {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} - EndGlobalSection -EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj deleted file mode 100644 index 6a6309b96b..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj +++ /dev/null @@ -1,137 +0,0 @@ - - - - - Debug - x64 - - - Release - x64 - - - - - - - 15.0 - {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} - Win32Proj - PrefixSum - 10.0.17763.0 - - - - Application - true - Intel(R) oneAPI DPC++ Compiler - Unicode - - - Application - false - Intel(R) oneAPI DPC++ Compiler - true - Unicode - - - Application - true - Intel(R) oneAPI DPC++ Compiler - Unicode - - - Application - false - Intel(R) oneAPI DPC++ Compiler - true - Unicode - - - - - - - - - - - - - - - - - - - - - true - - - true - - - false - - - false - - - - - - - - - - Console - true - - - - - - - - - %ONEAPI_ROOT%\dev-utilities\latest\include - - - Console - true - - - - - - - - - - - Console - true - true - true - - - - - - - - - %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) - - - Console - true - true - true - - - - - - \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters deleted file mode 100644 index 2003dce0f2..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters +++ /dev/null @@ -1,22 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hh;hpp;hxx;hm;inl;inc;ipp;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Source Files - - - \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user deleted file mode 100644 index 7288fa06dd..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user +++ /dev/null @@ -1,11 +0,0 @@ - - - - 21 47 - WindowsLocalDebugger - - - 21 47 - WindowsLocalDebugger - - \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md deleted file mode 100644 index 6bbc2cfdfb..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md +++ /dev/null @@ -1,124 +0,0 @@ -# `Prefix Sum` sample - -This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to -offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given -(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is -in ascending order. - -For comprehensive instructions regarding DPC++ Programming, go to -https://software.intel.com/en-us/oneapi-programming-guide -and search based on relevant terms noted in the comments. - -| Optimized for | Description -|:--- |:--- -| OS | Linux Ubuntu 18.04 -| Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) -| What you will learn | Implement bitonic sort using Intel DPC++ compiler -| Time to complete | 15 minutes - - -## Purpose - -Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns -a new sequence y0, y1, y2, ..., yn so that - -y0 = x0 -y1 = x0 + x1 -y2 = x0 + x1 + x2 -..... -yn = x0 + x1 + x2 + ... + xn - -Below is the pseudo code for computing prefix sum in parallel: - -n is power of 2 (1, 2, 4 , 8, 16, ...): - -for i from 0 to [log2 n] - 1 do - for j from 0 to (n-1) do in parallel - if j<2^i then - x_{j}^{i+1} <- x_{j}^{i}} - else - x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} - -In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. -Given n processors to perform each iteration of the inner loop in constant time, the algorithm -as a whole runs in O(log n) time, the number of iterations of the outer loop. - -The code will attempt first to execute on an available GPU and fallback to the system's CPU if a -compatible GPU is not detected. - -## Key Implementation Details - -The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command -groups. - -## License -This code sample is licensed under MIT license - -## Building the `PrefixSum` Program for CPU and GPU - -### Include Files -The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. - -### Running Samples In DevCloud -If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, -FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI -Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) - -### On a Linux* System -1. Build the program using the following `cmake` commands. - ``` - $ cd PrefixSum - $ mkdir build - $ cd build - $ cmake .. - $ make - ``` - -2. Run the program: - ``` - make run - ``` - -3. Clean the program using: - ``` - make clean - ``` - -### On a Windows* System - * Build the program using VS2017 or VS2019 - Right click on the solution file and open using either VS2017 or VS2019 IDE. - Right click on the project in Solution explorer and select Rebuild. - From top menu select Debug -> Start without Debugging. - - * Build the program using MSBuild - Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for - VS2019" - Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" - -## Running the sample -### Application Parameters - - Usage: PrefixSum - -where - -exponent is a positive number. The according length of the sequence is 2**exponent. - -seed is the seed used by the random generator to generate the randomness. - -The sample offloads the computation to GPU and then performs the verification the results in the CPU. -The results are verified if yk = yk-1 + xk the original compared. If the results are matched and -the ascending order is verified, the application will display a “Success!” message. - -### Example of Output -``` -$ ./PrefixSum 21 47 - -Sequence size: 2097152, seed: 47 -Num iteration: 21 -Device: Intel(R) Gen9 HD Graphics NEO -Kernel time: 170 ms - -Success! -``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json deleted file mode 100644 index def268a2f8..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", - "name": "PrefixSum", - "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], - "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", - "toolchain": [ "dpcpp" ], - "targetDevice": [ "CPU", "GPU" ], - "languages": [ { "cpp": {} } ], - "os": [ "linux", "windows" ], - "builder": [ "ide", "cmake" ], - "ciTests": { - "linux": [{ - "steps": [ - "mkdir build", - "cd build", - "cmake ..", - "make", - "make run" - ] - }], - "windows": [{ - "steps": [ - "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", - "cd x64/Release", - "PrefixSum.exe 21 47" - ] - }] - } -} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp deleted file mode 100644 index b2af8367a7..0000000000 --- a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp +++ /dev/null @@ -1,239 +0,0 @@ -//============================================================== -// Copyright © 2020 Intel Corporation -// -// SPDX-License-Identifier: MIT -// ============================================================= -// -// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That -// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and -// returns a new sequence y0, y1, y2, ..., yn so that -// -// y0 = x0 -// y1 = x0 + x1 -// y2 = x0 + x1 + x2 -// ..... -// yn = x0 + x1 + x2 + ... + xn -// -// Below is the pseudo code for computing prefix sum in parallel: -// -// n is power of 2 (1, 2, 4 , 8, 16, ...): -// -// for i from 0 to [log2 n] - 1 do -// for j from 0 to (n-1) do in parallel -// if j<2^i then -// x_{j}^{i+1} <- x_{j}^{i}} -// else -// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} -// -// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. -// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as -// a whole runs in O(log n) time, the number of iterations of the outer loop. -// - -#include - -// dpc_common.hpp can be found in the dev-utilities include folder. -// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp -#include "dpc_common.hpp" - -using namespace sycl; -using namespace std; - -void Show(int a[], int arraysize) -{ - for (int i = 0; i < arraysize; ++i) - { - std::cout << a[i] << " "; - if ((i % 16) == 15) std::cout << "\n"; - } - - std::cout << "\n"; - return; -} - -int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) -{ - unsigned int two_power = 1; - unsigned int num_iter = log2(nb); - //unsigned int uintmax = UINT_MAX; - int* result = NULL; - - // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; - // Buffer scope - { - buffer prefix1_buf(prefix1, range<1>{nb}); - buffer prefix2_buf(prefix2, range<1>{nb}); - - // Iterate over the necessary iterations. - for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { - - // Submit command group for execution - q.submit([&](handler& h) { - // Create accessors - auto prefix1_acc = prefix1_buf.get_access(h); - auto prefix2_acc = prefix2_buf.get_access(h); - - if (iter % 2 == 0) { - h.parallel_for(range<1>(nb), [=](id<1> j) { - if (j < two_power) { - prefix2_acc[j] = prefix1_acc[j]; - } - else { - prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; - } - }); // end parallel for loop in kernel - result = prefix2; - //std::cout << "return prefix2\n"; - } - else { - h.parallel_for(range<1>(nb), [=](id<1> j) { - if (j < two_power) { - prefix1_acc[j] = prefix2_acc[j]; - } - else { - prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; - } - }); // end parallel for loop in kernel - result = prefix1; - //std::cout << "return prefix1\n"; - } - }); // end device queue - } // end iteration - } // Buffer scope - - // Wait for commands to complete. Enforce synchronization on the command queue - q.wait_and_throw(); - - return result; -} -/* -void PrefixSum(int* x, unsigned int nb) -{ - unsigned int two_power = 1; - unsigned int num_iter = log2(nb); - int temp = 0; - - // Iterate over the necessary iterations - for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { - //Show(x, nb); - // std::cout << "two_power: " << two_power << "\n"; - for (unsigned int j = nb; j > 0; j--) { - if (j < two_power) { - x[j] = x[j]; - } - else { - x[j] = x[j] + x[j - two_power]; - } - } - } -} -*/ -void Usage(std::string prog_name, int exponent) { - std::cout << " Incorrect parameters\n"; - std::cout << " Usage: " << prog_name << " n k \n\n"; - std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ -ement in\n"; - std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ -sponding\n"; - std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; - std::cout << " k: Seed used to generate a random sequence.\n"; -} - -int main(int argc, char* argv[]) { - unsigned int nb, seed; - int n, exp_max = log2(std::numeric_limits::max()); - - // Read parameters. - try { - n = std::stoi(argv[1]); - - // Verify the boundary of acceptance. - if (n < 0 || n >= exp_max) { - Usage(argv[0], exp_max); - return -1; - } - - seed = std::stoi(argv[2]); - nb = pow(2, n); - } catch (...) { - Usage(argv[0], exp_max); - return -1; - } - - std::cout << "\nSequence size: " << nb << ", seed: " << seed; - - int num_iter = log2(nb); - std::cout << "\nNum iteration: " << num_iter << "\n"; - - // Define device selector as 'default' - default_selector device_selector; - - // exception handler - auto exception_handler = [](exception_list exceptionList) { - for (std::exception_ptr const& e : exceptionList) { - try { - std::rethrow_exception(e); - } catch (cl::sycl::exception const& e) { - std::terminate(); - } - } - }; - - // Create a device queue using DPC++ class queue - queue q(device_selector, exception_handler); - - std::cout << "Device: " << q.get_device().get_info() << "\n"; - - int *data = new int[nb]; - int *prefix_sum1 = new int[nb]; - int *prefix_sum2 = new int[nb]; - int *result = NULL; - - srand(seed); - - // Initialize data arrays - for (int i = 0; i < nb; i++) { - data[i] = prefix_sum1[i] = rand() % 10; - prefix_sum2[i] = 0; - } - - // Start timer - auto start = std::chrono::steady_clock::now(); - - result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); - - auto end = std::chrono::steady_clock::now(); - auto timeKern = std::chrono::duration_cast(end - start).count(); - std::cout << "Kernel time: " << timeKern << " ms" << "\n"; - - //std::cout << "\ndata after transforming using parallel prefix sum result:"; - //Show(result, nb); - - bool equal = true; - - if (result[0] != data[0]) - equal = false; - else - { - for (int i = 1; i < nb; i++) { - if (result[i] != result[i - 1] + data[i]) - { - equal = false; - break; - } - } - } - - delete[] data; - delete[] prefix_sum1; - delete[] prefix_sum2; - - if (!equal) { - std::cout << "\nFailed: " << std::endl; - return -2; - } - else { - std::cout << "\nSuccess!" << std::endl; - return 0; - } -} From 9de715d9102c6610259d8612feaa35d3943a172f Mon Sep 17 00:00:00 2001 From: Loc Nguyen Date: Tue, 18 Aug 2020 15:13:37 -0700 Subject: [PATCH 9/9] New code sample PrefixSum in ParallelPatterns. Signed-off-by: Loc Nguyen --- .../ParallelPatterns/PrefixSum/CMakeLists.txt | 30 +++ .../ParallelPatterns/PrefixSum/License.txt | 7 + .../ParallelPatterns/PrefixSum/PrefixSum.sln | 25 ++ .../PrefixSum/PrefixSum.vcxproj | 137 ++++++++++ .../PrefixSum/PrefixSum.vcxproj.filters | 22 ++ .../PrefixSum/PrefixSum.vcxproj.user | 11 + .../ParallelPatterns/PrefixSum/README.md | 124 +++++++++ .../ParallelPatterns/PrefixSum/sample.json | 29 +++ .../PrefixSum/src/PrefixSum.cpp | 239 ++++++++++++++++++ 9 files changed, 624 insertions(+) create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json create mode 100644 DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt new file mode 100644 index 0000000000..85fcec4963 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (PrefixSum) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (PrefixSum src/PrefixSum.cpp) + +add_custom_target (run + COMMAND PrefixSum 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt new file mode 100644 index 0000000000..415025cf03 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln new file mode 100644 index 0000000000..3587a92e74 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29926.136 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj new file mode 100644 index 0000000000..6a6309b96b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} + Win32Proj + PrefixSum + 10.0.17763.0 + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters new file mode 100644 index 0000000000..2003dce0f2 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user new file mode 100644 index 0000000000..7288fa06dd --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md new file mode 100644 index 0000000000..6bbc2cfdfb --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md @@ -0,0 +1,124 @@ +# `Prefix Sum` sample + +This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns +a new sequence y0, y1, y2, ..., yn so that + +y0 = x0 +y1 = x0 + x1 +y2 = x0 + x1 + x2 +..... +yn = x0 + x1 + x2 + ... + xn + +Below is the pseudo code for computing prefix sum in parallel: + +n is power of 2 (1, 2, 4 , 8, 16, ...): + +for i from 0 to [log2 n] - 1 do + for j from 0 to (n-1) do in parallel + if j<2^i then + x_{j}^{i+1} <- x_{j}^{i}} + else + x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} + +In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +Given n processors to perform each iteration of the inner loop in constant time, the algorithm +as a whole runs in O(log n) time, the number of iterations of the outer loop. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a +compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command +groups. + +## License +This code sample is licensed under MIT license + +## Building the `PrefixSum` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd PrefixSum + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: PrefixSum + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the verification the results in the CPU. +The results are verified if yk = yk-1 + xk the original compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./PrefixSum 21 47 + +Sequence size: 2097152, seed: 47 +Num iteration: 21 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 170 ms + +Success! +``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json new file mode 100644 index 0000000000..def268a2f8 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", + "name": "PrefixSum", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "PrefixSum.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp new file mode 100644 index 0000000000..b2af8367a7 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp @@ -0,0 +1,239 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That +// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and +// returns a new sequence y0, y1, y2, ..., yn so that +// +// y0 = x0 +// y1 = x0 + x1 +// y2 = x0 + x1 + x2 +// ..... +// yn = x0 + x1 + x2 + ... + xn +// +// Below is the pseudo code for computing prefix sum in parallel: +// +// n is power of 2 (1, 2, 4 , 8, 16, ...): +// +// for i from 0 to [log2 n] - 1 do +// for j from 0 to (n-1) do in parallel +// if j<2^i then +// x_{j}^{i+1} <- x_{j}^{i}} +// else +// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} +// +// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as +// a whole runs in O(log n) time, the number of iterations of the outer loop. +// + +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +void Show(int a[], int arraysize) +{ + for (int i = 0; i < arraysize; ++i) + { + std::cout << a[i] << " "; + if ((i % 16) == 15) std::cout << "\n"; + } + + std::cout << "\n"; + return; +} + +int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + //unsigned int uintmax = UINT_MAX; + int* result = NULL; + + // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; + // Buffer scope + { + buffer prefix1_buf(prefix1, range<1>{nb}); + buffer prefix2_buf(prefix2, range<1>{nb}); + + // Iterate over the necessary iterations. + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + + // Submit command group for execution + q.submit([&](handler& h) { + // Create accessors + auto prefix1_acc = prefix1_buf.get_access(h); + auto prefix2_acc = prefix2_buf.get_access(h); + + if (iter % 2 == 0) { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix2_acc[j] = prefix1_acc[j]; + } + else { + prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix2; + //std::cout << "return prefix2\n"; + } + else { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix1_acc[j] = prefix2_acc[j]; + } + else { + prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix1; + //std::cout << "return prefix1\n"; + } + }); // end device queue + } // end iteration + } // Buffer scope + + // Wait for commands to complete. Enforce synchronization on the command queue + q.wait_and_throw(); + + return result; +} +/* +void PrefixSum(int* x, unsigned int nb) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + int temp = 0; + + // Iterate over the necessary iterations + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + //Show(x, nb); + // std::cout << "two_power: " << two_power << "\n"; + for (unsigned int j = nb; j > 0; j--) { + if (j < two_power) { + x[j] = x[j]; + } + else { + x[j] = x[j] + x[j - two_power]; + } + } + } +} +*/ +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ +ement in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ +sponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char* argv[]) { + unsigned int nb, seed; + int n, exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + nb = pow(2, n); + } catch (...) { + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nSequence size: " << nb << ", seed: " << seed; + + int num_iter = log2(nb); + std::cout << "\nNum iteration: " << num_iter << "\n"; + + // Define device selector as 'default' + default_selector device_selector; + + // exception handler + auto exception_handler = [](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::terminate(); + } + } + }; + + // Create a device queue using DPC++ class queue + queue q(device_selector, exception_handler); + + std::cout << "Device: " << q.get_device().get_info() << "\n"; + + int *data = new int[nb]; + int *prefix_sum1 = new int[nb]; + int *prefix_sum2 = new int[nb]; + int *result = NULL; + + srand(seed); + + // Initialize data arrays + for (int i = 0; i < nb; i++) { + data[i] = prefix_sum1[i] = rand() % 10; + prefix_sum2[i] = 0; + } + + // Start timer + auto start = std::chrono::steady_clock::now(); + + result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); + + auto end = std::chrono::steady_clock::now(); + auto timeKern = std::chrono::duration_cast(end - start).count(); + std::cout << "Kernel time: " << timeKern << " ms" << "\n"; + + //std::cout << "\ndata after transforming using parallel prefix sum result:"; + //Show(result, nb); + + bool equal = true; + + if (result[0] != data[0]) + equal = false; + else + { + for (int i = 1; i < nb; i++) { + if (result[i] != result[i - 1] + data[i]) + { + equal = false; + break; + } + } + } + + delete[] data; + delete[] prefix_sum1; + delete[] prefix_sum2; + + if (!equal) { + std::cout << "\nFailed: " << std::endl; + return -2; + } + else { + std::cout << "\nSuccess!" << std::endl; + return 0; + } +}