diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt new file mode 100644 index 0000000000..85fcec4963 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (PrefixSum) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (PrefixSum src/PrefixSum.cpp) + +add_custom_target (run + COMMAND PrefixSum 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt new file mode 100644 index 0000000000..415025cf03 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln new file mode 100644 index 0000000000..3587a92e74 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29926.136 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj new file mode 100644 index 0000000000..6a6309b96b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} + Win32Proj + PrefixSum + 10.0.17763.0 + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters new file mode 100644 index 0000000000..2003dce0f2 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user new file mode 100644 index 0000000000..7288fa06dd --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md new file mode 100644 index 0000000000..6bbc2cfdfb --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md @@ -0,0 +1,124 @@ +# `Prefix Sum` sample + +This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns +a new sequence y0, y1, y2, ..., yn so that + +y0 = x0 +y1 = x0 + x1 +y2 = x0 + x1 + x2 +..... +yn = x0 + x1 + x2 + ... + xn + +Below is the pseudo code for computing prefix sum in parallel: + +n is power of 2 (1, 2, 4 , 8, 16, ...): + +for i from 0 to [log2 n] - 1 do + for j from 0 to (n-1) do in parallel + if j<2^i then + x_{j}^{i+1} <- x_{j}^{i}} + else + x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} + +In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +Given n processors to perform each iteration of the inner loop in constant time, the algorithm +as a whole runs in O(log n) time, the number of iterations of the outer loop. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a +compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command +groups. + +## License +This code sample is licensed under MIT license + +## Building the `PrefixSum` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd PrefixSum + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: PrefixSum + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the verification the results in the CPU. +The results are verified if yk = yk-1 + xk the original compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./PrefixSum 21 47 + +Sequence size: 2097152, seed: 47 +Num iteration: 21 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 170 ms + +Success! +``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json new file mode 100644 index 0000000000..def268a2f8 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", + "name": "PrefixSum", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "PrefixSum.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp new file mode 100644 index 0000000000..b2af8367a7 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp @@ -0,0 +1,239 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That +// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and +// returns a new sequence y0, y1, y2, ..., yn so that +// +// y0 = x0 +// y1 = x0 + x1 +// y2 = x0 + x1 + x2 +// ..... +// yn = x0 + x1 + x2 + ... + xn +// +// Below is the pseudo code for computing prefix sum in parallel: +// +// n is power of 2 (1, 2, 4 , 8, 16, ...): +// +// for i from 0 to [log2 n] - 1 do +// for j from 0 to (n-1) do in parallel +// if j<2^i then +// x_{j}^{i+1} <- x_{j}^{i}} +// else +// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} +// +// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as +// a whole runs in O(log n) time, the number of iterations of the outer loop. +// + +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +void Show(int a[], int arraysize) +{ + for (int i = 0; i < arraysize; ++i) + { + std::cout << a[i] << " "; + if ((i % 16) == 15) std::cout << "\n"; + } + + std::cout << "\n"; + return; +} + +int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + //unsigned int uintmax = UINT_MAX; + int* result = NULL; + + // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; + // Buffer scope + { + buffer prefix1_buf(prefix1, range<1>{nb}); + buffer prefix2_buf(prefix2, range<1>{nb}); + + // Iterate over the necessary iterations. + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + + // Submit command group for execution + q.submit([&](handler& h) { + // Create accessors + auto prefix1_acc = prefix1_buf.get_access(h); + auto prefix2_acc = prefix2_buf.get_access(h); + + if (iter % 2 == 0) { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix2_acc[j] = prefix1_acc[j]; + } + else { + prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix2; + //std::cout << "return prefix2\n"; + } + else { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix1_acc[j] = prefix2_acc[j]; + } + else { + prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix1; + //std::cout << "return prefix1\n"; + } + }); // end device queue + } // end iteration + } // Buffer scope + + // Wait for commands to complete. Enforce synchronization on the command queue + q.wait_and_throw(); + + return result; +} +/* +void PrefixSum(int* x, unsigned int nb) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + int temp = 0; + + // Iterate over the necessary iterations + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + //Show(x, nb); + // std::cout << "two_power: " << two_power << "\n"; + for (unsigned int j = nb; j > 0; j--) { + if (j < two_power) { + x[j] = x[j]; + } + else { + x[j] = x[j] + x[j - two_power]; + } + } + } +} +*/ +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ +ement in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ +sponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char* argv[]) { + unsigned int nb, seed; + int n, exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + nb = pow(2, n); + } catch (...) { + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nSequence size: " << nb << ", seed: " << seed; + + int num_iter = log2(nb); + std::cout << "\nNum iteration: " << num_iter << "\n"; + + // Define device selector as 'default' + default_selector device_selector; + + // exception handler + auto exception_handler = [](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::terminate(); + } + } + }; + + // Create a device queue using DPC++ class queue + queue q(device_selector, exception_handler); + + std::cout << "Device: " << q.get_device().get_info() << "\n"; + + int *data = new int[nb]; + int *prefix_sum1 = new int[nb]; + int *prefix_sum2 = new int[nb]; + int *result = NULL; + + srand(seed); + + // Initialize data arrays + for (int i = 0; i < nb; i++) { + data[i] = prefix_sum1[i] = rand() % 10; + prefix_sum2[i] = 0; + } + + // Start timer + auto start = std::chrono::steady_clock::now(); + + result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); + + auto end = std::chrono::steady_clock::now(); + auto timeKern = std::chrono::duration_cast(end - start).count(); + std::cout << "Kernel time: " << timeKern << " ms" << "\n"; + + //std::cout << "\ndata after transforming using parallel prefix sum result:"; + //Show(result, nb); + + bool equal = true; + + if (result[0] != data[0]) + equal = false; + else + { + for (int i = 1; i < nb; i++) { + if (result[i] != result[i - 1] + data[i]) + { + equal = false; + break; + } + } + } + + delete[] data; + delete[] prefix_sum1; + delete[] prefix_sum2; + + if (!equal) { + std::cout << "\nFailed: " << std::endl; + return -2; + } + else { + std::cout << "\nSuccess!" << std::endl; + return 0; + } +}