diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/src/matrix_mul_dpcpp.cpp b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/src/matrix_mul_dpcpp.cpp index 3f2b0ced6c..87bacfec9e 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/src/matrix_mul_dpcpp.cpp +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/src/matrix_mul_dpcpp.cpp @@ -58,9 +58,9 @@ int main() { // Create 2D buffers for matrices, buffer c is bound with host memory c_back - buffer a(range(M, N)); - buffer b(range(N, P)); - buffer c(reinterpret_cast(c_back), range(M, P)); + buffer a_buf(range(M, N)); + buffer b_buf(range(N, P)); + buffer c_buf(reinterpret_cast(c_back), range(M, P)); cout << "Problem size: c(" << M << "," << P << ") = a(" << M << "," << N << ") * b(" << N << "," << P << ")\n"; @@ -71,40 +71,40 @@ int main() { // execution ordering. // Submit command group to queue to initialize matrix a - q.submit([&](handler &h) { + q.submit([&](auto &h) { // Get write only access to the buffer on a device. - auto accessor = a.get_access(h); + accessor a(a_buf, h, write_only); // Execute kernel. - h.parallel_for(range(M, N), [=](id<2> index) { + h.parallel_for(range(M, N), [=](auto index) { // Each element of matrix a is 1. - accessor[index] = 1.0f; + a[index] = 1.0f; }); }); // Submit command group to queue to initialize matrix b - q.submit([&](handler &h) { + q.submit([&](auto &h) { // Get write only access to the buffer on a device - auto accessor = b.get_access(h); + accessor b(b_buf, h, write_only); // Execute kernel. - h.parallel_for(range(N, P), [=](id<2> index) { + h.parallel_for(range(N, P), [=](auto index) { // Each column of b is the sequence 1,2,...,N - accessor[index] = index[0] + 1.0f; + b[index] = index[0] + 1.0f; }); }); // Submit command group to queue to multiply matrices: c = a * b - q.submit([&](handler &h) { + q.submit([&](auto &h) { // Read from a and b, write to c - auto A = a.get_access(h); - auto B = b.get_access(h); - auto C = c.get_access(h); + accessor A(a_buf, h, read_only); + accessor B(b_buf, h, read_only); + accessor C(c_buf, h, write_only); - int width_a = a.get_range()[1]; + int width_a = a_buf.get_range()[1]; // Execute kernel. - h.parallel_for(range(M, P), [=](id<2> index) { + h.parallel_for(range(M, P), [=](auto index) { // Get global position in Y direction. int row = index[0]; // Get global position in X direction. diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp index 349ffe2026..ec74f276a9 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp @@ -22,9 +22,21 @@ #include #include #include + +// dpc_common.hpp can be found in the dev-utilities include folder. 
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" + #if FPGA || FPGA_EMULATOR -#include +// Header locations and some DPC++ extensions changed between beta09 and beta10 +// Temporarily modify the code sample to accept either version +#define BETA09 20200827 +#if __SYCL_COMPILER_VERSION <= BETA09 + #include + namespace INTEL = sycl::intel; // Namespace alias for backward compatibility +#else + #include +#endif #endif using namespace sycl; @@ -39,18 +51,18 @@ typedef array IntArray; //************************************ void IotaParallel(queue &q, IntArray &a_array, int value) { // Create the range object for the array managed by the buffer. - range<1> num_items{a_array.size()}; + range num_items{a_array.size()}; // Create buffer that hold the data shared between the host and the devices. // The buffer destructor is responsible to copy the data back to host when it // goes out of scope. - buffer a_buf(a_array.data(), num_items); + buffer a_buf(a_array); // Submit a command group to the queue by a lambda function that contains the // data access permission and device computation (kernel). - q.submit([&](handler &h) { + q.submit([&](auto &h) { // Create an accessor with write permission. - auto a = a_buf.get_access(h); + accessor a(a_buf, h, write_only); // Use parallel_for to populate consecutive numbers starting with a // specified value in parallel on device. This executes the kernel. @@ -58,7 +70,7 @@ void IotaParallel(queue &q, IntArray &a_array, int value) { // 2nd parameter is the kernel, a lambda that specifies what to do per // work item. The parameter of the lambda is the work item id. // DPC++ supports unnamed lambda kernel by default. - h.parallel_for(num_items, [=](id<1> i) { a[i] = value + i; }); + h.parallel_for(num_items, [=](auto i) { a[i] = value + i; }); }); } @@ -69,10 +81,10 @@ int main() { // Create device selector for the device of your interest. #if FPGA_EMULATOR // DPC++ extension: FPGA emulator selector on systems without FPGA card. - intel::fpga_emulator_selector d_selector; + INTEL::fpga_emulator_selector d_selector; #elif FPGA // DPC++ extension: FPGA selector on systems with FPGA card. - intel::fpga_selector d_selector; + INTEL::fpga_selector d_selector; #else // The default device selector will select the most performant device. default_selector d_selector; @@ -96,8 +108,8 @@ int main() { // Parallel iota in DPC++. IotaParallel(q, parallel, value); } catch (std::exception const &e) { - cout << "An exception is caught while computing on device.\n"; - terminate(); + cout << "An exception is caught while computing on device.\n"; + terminate(); } // Verify two results are equal. @@ -114,7 +126,7 @@ int main() { // Print out iota result. for (int i = 0; i < indices_size; i++) { int j = indices[i]; - if (i == indices_size - 1) std::cout << "...\n"; + if (i == indices_size - 1) cout << "...\n"; cout << "[" << j << "]: " << j << " + " << value << " = " << parallel[j] << "\n"; } diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp index 1f37eaa361..d9480e9972 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp @@ -22,9 +22,21 @@ #include #include #include + +// dpc_common.hpp can be found in the dev-utilities include folder. 
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" + #if FPGA || FPGA_EMULATOR -#include +// Header locations and some DPC++ extensions changed between beta09 and beta10 +// Temporarily modify the code sample to accept either version +#define BETA09 20200827 +#if __SYCL_COMPILER_VERSION <= BETA09 + #include + namespace INTEL = sycl::intel; // Namespace alias for backward compatibility +#else + #include +#endif #endif using namespace sycl; @@ -38,7 +50,7 @@ constexpr size_t array_size = 10000; //************************************ void IotaParallel(queue &q, int *a, size_t size, int value) { // Create the range object for the array. - range<1> num_items{size}; + range num_items{size}; // Use parallel_for to populate consecutive numbers starting with a specified // value in parallel on device. This executes the kernel. @@ -46,7 +58,7 @@ void IotaParallel(queue &q, int *a, size_t size, int value) { // 2nd parameter is the kernel, a lambda that specifies what to do per // work item. The parameter of the lambda is the work item id. // DPC++ supports unnamed lambda kernel by default. - auto e = q.parallel_for(num_items, [=](id<1> i) { a[i] = value + i; }); + auto e = q.parallel_for(num_items, [=](auto i) { a[i] = value + i; }); // q.parallel_for() is an asynchronous call. DPC++ runtime enqueues and runs // the kernel asynchronously. Wait for the asynchronous call to complete. @@ -60,10 +72,10 @@ int main() { // Create device selector for the device of your interest. #if FPGA_EMULATOR // DPC++ extension: FPGA emulator selector on systems without FPGA card. - intel::fpga_emulator_selector d_selector; + INTEL::fpga_emulator_selector d_selector; #elif FPGA // DPC++ extension: FPGA selector on systems with FPGA card. - intel::fpga_selector d_selector; + INTEL::fpga_selector d_selector; #else // The default device selector will select the most performant device. default_selector d_selector; @@ -110,16 +122,16 @@ int main() { // Print out iota result. 
for (int i = 0; i < indices_size; i++) { int j = indices[i]; - if (i == indices_size - 1) std::cout << "...\n"; - cout << "[" << j << "]: " << j << " + " << value << " = " + if (i == indices_size - 1) cout << "...\n"; + cout << "[" << j << "]: " << j << " + " << value << " = " << sequential[j] << "\n"; } free(sequential, q); free(parallel, q); } catch (std::exception const &e) { - cout << "An exception is caught while computing on device.\n"; - terminate(); + cout << "An exception is caught while computing on device.\n"; + terminate(); } cout << "Successfully completed on device.\n"; diff --git a/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/CMakeLists.txt b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/CMakeLists.txt new file mode 100644 index 0000000000..e7fd3619bd --- /dev/null +++ b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/CMakeLists.txt @@ -0,0 +1,23 @@ +cmake_minimum_required (VERSION 3.5) +project (all-pairs-shortest-paths) + +set(CMAKE_CXX_COMPILER dpcpp) + + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++17") + +add_executable(apsp src/apsp.cpp) + +if(WIN32) +add_custom_target(run apsp.exe) +else() +add_custom_target(run ./apsp) +endif() \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/License.txt b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/License.txt new file mode 100644 index 0000000000..0abc121bd7 --- /dev/null +++ b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/README.md b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/README.md new file mode 100644 index 0000000000..23db7ecd8a --- /dev/null +++ b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/README.md @@ -0,0 +1,79 @@ +# All Pairs Shortest Paths sample +`All Pairs Shortest Paths` uses the Floyd Warshall algorithm to find the shortest paths between pairs of vertices in a graph. 
It uses a parallel blocked algorithm that enables the application to efficiently offload compute intensive work to the GPU. + +For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04, Windows 10 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | The All Pairs Shortest Paths sample demonstrates the following using the Intel® oneAPI DPC++/C++ Compiler
  • Offloading compute intensive parts of the application using lambda kernel
  • Measuring kernel execution time
+| Time to complete | 15 minutes + +## Purpose +This sample uses blocked Floyd-Warshall all pairs shortest paths algorithm to compute a matrix that represents the minimum distance from any node to all other nodes in the graph. Using parallel blocked processing, blocks can be calculated simultaneously by distributing task computations to the GPU. + +## Key implementation details +The basic DPC++ implementation explained in the code includes device selector, unified shared memory, kernel, and command groups. + +The parallel implementation of blocked Floyd Warshall algorithm has three phases. Given a prior round of these computation phases are complete, phase 1 is independent; Phase 2 can only execute after phase 1 completes; Similarly phase 3 depends on phase 2 so can only execute after phase 2 is complete. + +The inner loop of the sequential implementation is: + g[i][j] = min(g[i][j], g[i][k] + g[k][j]) + +A careful observation shows that for the kth iteration of the outer loop, the computation depends on cells either on the kth column, g[i][k] or on the kth row, g[k][j] of the graph. Phase 1 handles g[k][k], phase 2 handles g[\*][k] and g[k][\*], and phase 3 handles g[\*][\*] in that sequence. This cell level observations largely propagate to the blocks as well. + +In each phase computation within a block can proceed independently in parallel. + +## License +This code sample is licensed under MIT license + +## Building the Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System + +Perform the following steps: + +1. Build the program using the following cmake commands. +``` + $ cd all-pairs-shortest-paths + $ mkdir build + $ cd build + $ cmake .. + $ make +``` + +2. Run the program
+``` + $ make run + +``` + +### On a Windows* System Using Visual Studio* version 2017 or Newer + +* Build the program using VS2017 or VS2019: Right click on the solution file and open using either VS2017 or VS2019 IDE. Right click on the project in Solution explorer and select Rebuild. From top menu select Debug -> Start without Debugging. +* Build the program using MSBuild: Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019". Run - MSBuild all-pairs-shortest-paths.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample + +### Example Output +``` +Device: Intel(R) Gen9 +Repeating computation 8 times to measure run time ... +Iteration: 1 +Iteration: 2 +Iteration: 3 +... +Iteration: 8 +Successfully computed all pairs shortest paths in parallel! +Time sequential: 0.583029 sec +Time parallel: 0.159223 sec + +``` diff --git a/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/all-pairs-shortest-paths.sln b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/all-pairs-shortest-paths.sln new file mode 100644 index 0000000000..ecdfab1dcc --- /dev/null +++ b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/all-pairs-shortest-paths.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.30104.148 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "apsp", "all-pairs-shortest-paths.vcxproj", "{28C45A93-5D80-4635-BAFB-D5386EEEA466}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {28C45A93-5D80-4635-BAFB-D5386EEEA466}.Debug|x64.ActiveCfg = Debug|x64 + {28C45A93-5D80-4635-BAFB-D5386EEEA466}.Debug|x64.Build.0 = Debug|x64 + {28C45A93-5D80-4635-BAFB-D5386EEEA466}.Release|x64.ActiveCfg = Release|x64 + {28C45A93-5D80-4635-BAFB-D5386EEEA466}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {2ADBAE67-858D-4FA7-B81F-3C9AF9347EFA} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/all-pairs-shortest-paths.vcxproj b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/all-pairs-shortest-paths.vcxproj new file mode 100644 index 0000000000..909207a456 --- /dev/null +++ b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/all-pairs-shortest-paths.vcxproj @@ -0,0 +1,87 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {28c45a93-5d80-4635-bafb-d5386eeea466} + Win32Proj + all_pairs_shortest_paths + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + true + + + false + + + + + + + + Level3 + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + + + + + Level3 + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/sample.json b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/sample.json new file mode 100644 
index 0000000000..62c939ead1 --- /dev/null +++ b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "4F0F5FBD-C237-4A9F-B259-2C56ABFB40D9", + "name": "all-pairs-shortest-paths", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++/C++ Compiler/CPU and GPU" ], + "description": "all-pairs-shortest-paths using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild all-pairs-shortest-paths.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "apsp.exe" + ] + }] + } + } diff --git a/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/src/apsp.cpp b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/src/apsp.cpp new file mode 100644 index 0000000000..fb201fa871 --- /dev/null +++ b/DirectProgramming/DPC++/GraphAlgorithms/all-pairs-shortest-paths/src/apsp.cpp @@ -0,0 +1,369 @@ +//============================================================== +// This sample provides a parallel implementation of blocked Floyd Warshall +// algorithm to compute all pairs shortest paths using DPC++. +//============================================================== +// Copyright © Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include +#include +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace std; +using namespace sycl; + +// Number of nodes in the graph. +constexpr int nodes = 1024; + +// Block length and block count (along a single dimension). +constexpr int block_length = 16; +constexpr int block_count = (nodes / block_length); + +// Maximum distance between two adjacent nodes. +constexpr int max_distance = 100; +constexpr int infinite = (nodes * max_distance); + +// Number of repetitions. +constexpr int repetitions = 8; + +// Randomly initialize directed graph. +void InitializeDirectedGraph(int *graph) { + for (int i = 0; i < nodes; i++) { + for (int j = 0; j < nodes; j++) { + int cell = i * nodes + j; + + if (i == j) { + graph[cell] = 0; + } else if (rand() % 2) { + graph[cell] = infinite; + } else { + graph[cell] = rand() % max_distance + 1; + } + } + } +} + +// Copy graph. +void CopyGraph(int *to, int *from) { + for (int i = 0; i < nodes; i++) { + for (int j = 0; j < nodes; j++) { + int cell = i * nodes + j; + to[cell] = from[cell]; + } + } +} + +// Check if two graphs are equal. +bool VerifyGraphsAreEqual(int *graph, int *h) { + for (int i = 0; i < nodes; i++) { + for (int j = 0; j < nodes; j++) { + int cell = i * nodes + j; + + if (graph[cell] != h[cell]) { + return false; + } + } + } + + return true; +} + +// The basic (sequential) implementation of Floyd Warshall algorithm for +// computing all pairs shortest paths. 
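+// For every intermediate node k it applies the relaxation
+//   graph[i][j] = min(graph[i][j], graph[i][k] + graph[k][j])
+// to all pairs (i, j), which is O(nodes^3) work in total.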
+void FloydWarshall(int *graph) { + for (int k = 0; k < nodes; k++) { + for (int i = 0; i < nodes; i++) { + for (int j = 0; j < nodes; j++) { + if (graph[i * nodes + j] > + graph[i * nodes + k] + graph[k * nodes + j]) { + graph[i * nodes + j] = graph[i * nodes + k] + graph[k * nodes + j]; + } + } + } + } +} + +typedef accessor + LocalBlock; + +// Inner loop of the blocked Floyd Warshall algorithm. A thread handles one cell +// of a block. To complete the computation of a block, this function is invoked +// by as many threads as there are cells in the block. Each such invocation +// computes as many iterations as there are blocks (along a single dimension). +// Moreover, each thread (simultaneously operating on a block), synchronizes +// between them at the end of each iteration. This is required for correctness +// as a following iteration depends on the previous iteration. +void BlockedFloydWarshallCompute(nd_item<1> &item, const LocalBlock &C, + const LocalBlock &A, const LocalBlock &B, + int i, int j) { + for (int k = 0; k < block_length; k++) { + if (C[i][j] > A[i][k] + B[k][j]) { + C[i][j] = A[i][k] + B[k][j]; + } + + item.barrier(access::fence_space::local_space); + } +} + +// Phase 1 of blocked Floyd Warshall algorithm. It always operates on a block +// on the diagonal of the adjacency matrix of the graph. +void BlockedFloydWarshallPhase1(queue &q, int *graph, int round) { + // Each group will process one block. + constexpr auto blocks = 1; + // Each item/thread in a group will handle one cell of the block. + constexpr auto block_size = block_length * block_length; + + q.submit([&](handler &h) { + LocalBlock block(range<2>(block_length, block_length), h); + + h.parallel_for( + nd_range<1>(blocks * block_size, block_size), [=](nd_item<1> item) { + auto tid = item.get_local_id(0); + auto i = tid / block_length; + auto j = tid % block_length; + + // Copy data to local memory. + block[i][j] = graph[(round * block_length + i) * nodes + + (round * block_length + j)]; + item.barrier(access::fence_space::local_space); + + // Compute. + BlockedFloydWarshallCompute(item, block, block, block, i, j); + + // Copy back data to global memory. + graph[(round * block_length + i) * nodes + + (round * block_length + j)] = block[i][j]; + item.barrier(access::fence_space::local_space); + }); + }); + + q.wait(); +} + +// Phase 2 of blocked Floyd Warshall algorithm. It always operates on blocks +// that are either on the same row or on the same column of a diagonal block. +void BlockedFloydWarshallPhase2(queue &q, int *graph, int round) { + // Each group will process one block. + constexpr auto blocks = block_count; + // Each item/thread in a group will handle one cell of the block. + constexpr auto block_size = block_length * block_length; + + q.submit([&](handler &h) { + LocalBlock diagonal(range<2>(block_length, block_length), h); + LocalBlock off_diag(range<2>(block_length, block_length), h); + + h.parallel_for( + nd_range<1>(blocks * block_size, block_size), [=](nd_item<1> item) { + auto gid = item.get_group(0); + auto index = gid; + + if (index != round) { + auto tid = item.get_local_id(0); + auto i = tid / block_length; + auto j = tid % block_length; + + // Copy data to local memory. + diagonal[i][j] = graph[(round * block_length + i) * nodes + + (round * block_length + j)]; + off_diag[i][j] = graph[(index * block_length + i) * nodes + + (round * block_length + j)]; + item.barrier(access::fence_space::local_space); + + // Compute for blocks above and below the diagonal block. 
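+            // Here C = A = off_diag (supplies g[i][k]) and B = diagonal
+            // (supplies g[k][j]), so the column block is updated in place.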
+ BlockedFloydWarshallCompute(item, off_diag, off_diag, diagonal, i, + j); + + // Copy back data to global memory. + graph[(index * block_length + i) * nodes + + (round * block_length + j)] = off_diag[i][j]; + + // Copy data to local memory. + off_diag[i][j] = graph[(round * block_length + i) * nodes + + (index * block_length + j)]; + item.barrier(access::fence_space::local_space); + + // Compute for blocks at left and at right of the diagonal block. + BlockedFloydWarshallCompute(item, off_diag, diagonal, off_diag, i, + j); + + // Copy back data to global memory. + graph[(round * block_length + i) * nodes + + (index * block_length + j)] = off_diag[i][j]; + item.barrier(access::fence_space::local_space); + } + }); + }); + + q.wait(); +} + +// Phase 3 of blocked Floyd Warshall algorithm. It operates on all blocks except +// the ones that are handled in phase 1 and in phase 2 of the algorithm. +void BlockedFloydWarshallPhase3(queue &q, int *graph, int round) { + // Each group will process one block. + constexpr auto blocks = block_count * block_count; + // Each item/thread in a group will handle one cell of the block. + constexpr auto block_size = block_length * block_length; + + q.submit([&](handler &h) { + LocalBlock A(range<2>(block_length, block_length), h); + LocalBlock B(range<2>(block_length, block_length), h); + LocalBlock C(range<2>(block_length, block_length), h); + + h.parallel_for( + nd_range<1>(blocks * block_size, block_size), [=](nd_item<1> item) { + auto bk = round; + + auto gid = item.get_group(0); + auto bi = gid / block_count; + auto bj = gid % block_count; + + if ((bi != bk) && (bj != bk)) { + auto tid = item.get_local_id(0); + auto i = tid / block_length; + auto j = tid % block_length; + + // Copy data to local memory. + A[i][j] = graph[(bi * block_length + i) * nodes + + (bk * block_length + j)]; + B[i][j] = graph[(bk * block_length + i) * nodes + + (bj * block_length + j)]; + C[i][j] = graph[(bi * block_length + i) * nodes + + (bj * block_length + j)]; + + item.barrier(access::fence_space::local_space); + + // Compute. + BlockedFloydWarshallCompute(item, C, A, B, i, j); + + // Copy back data to global memory. + graph[(bi * block_length + i) * nodes + (bj * block_length + j)] = + C[i][j]; + item.barrier(access::fence_space::local_space); + } + }); + }); + + q.wait(); +} + +// Parallel implementation of blocked Floyd Warshall algorithm. It has three +// phases. Given a prior round of these computation phases are complete, phase 1 +// is independent; Phase 2 can only execute after phase 1 completes; Similarly +// phase 3 depends on phase 2 so can only execute after phase 2 is complete. +// +// The inner loop of the sequential implementation is similar to: +// g[i][j] = min(g[i][j], g[i][k] + g[k][j]) +// A careful observation shows that for the kth iteration of the outer loop, +// the computation depends on cells either on the kth column, g[i][k] or on the +// kth row, g[k][j] of the graph. Phase 1 handles g[k][k], phase 2 handles +// g[*][k] and g[k][*], and phase 3 handles g[*][*] in that sequence. This cell +// level observations largely propagate to the blocks as well. 
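+//
+// Each round below advances k by one block: phase 1 updates the diagonal
+// block (round, round), phase 2 updates the blocks sharing its row or
+// column, and phase 3 updates all remaining blocks.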
+void BlockedFloydWarshall(queue &q, int *graph) { + for (int round = 0; round < block_count; round++) { + BlockedFloydWarshallPhase1(q, graph, round); + BlockedFloydWarshallPhase2(q, graph, round); + BlockedFloydWarshallPhase3(q, graph, round); + } +} + +int main() { + try { + queue q{default_selector{}, dpc_common::exception_handler}; + auto device = q.get_device(); + auto work_group_size = device.get_info(); + auto block_size = block_length * block_length; + + cout << "Device: " << device.get_info() << "\n"; + + if (work_group_size < block_size) { + cout << "Work group size " << work_group_size + << " is less than required size " << block_size << "\n"; + return -1; + } + + // Allocate unified shared memory so that graph data is accessible to both + // the CPU and the device (e.g., a GPU). + int *graph = (int *)malloc(sizeof(int) * nodes * nodes); + int *sequential = malloc_shared(nodes * nodes, q); + int *parallel = malloc_shared(nodes * nodes, q); + + if ((graph == nullptr) || (sequential == nullptr) || + (parallel == nullptr)) { + if (graph != nullptr) free(graph); + if (sequential != nullptr) free(sequential, q); + if (parallel != nullptr) free(parallel, q); + + cout << "Memory allocation failure.\n"; + return -1; + } + + // Initialize directed graph. + InitializeDirectedGraph(graph); + + // Warm up the JIT. + CopyGraph(parallel, graph); + BlockedFloydWarshall(q, parallel); + + // Measure execution times. + double elapsed_s = 0; + double elapsed_p = 0; + int i; + + cout << "Repeating computation " << repetitions + << " times to measure run time ...\n"; + + for (i = 0; i < repetitions; i++) { + cout << "Iteration: " << (i + 1) << "\n"; + + // Sequential all pairs shortest paths. + CopyGraph(sequential, graph); + + dpc_common::TimeInterval timer_s; + + FloydWarshall(sequential); + elapsed_s += timer_s.Elapsed(); + + // Parallel all pairs shortest paths. + CopyGraph(parallel, graph); + + dpc_common::TimeInterval timer_p; + + BlockedFloydWarshall(q, parallel); + elapsed_p += timer_p.Elapsed(); + + // Verify two results are equal. + if (!VerifyGraphsAreEqual(sequential, parallel)) { + cout << "Failed to correctly compute all pairs shortest paths!\n"; + break; + } + } + + if (i == repetitions) { + cout << "Successfully computed all pairs shortest paths in parallel!\n"; + + elapsed_s /= repetitions; + elapsed_p /= repetitions; + + cout << "Time sequential: " << elapsed_s << " sec\n"; + cout << "Time parallel: " << elapsed_p << " sec\n"; + } + + // Free unified shared memory. 
+ free(graph); + free(sequential, q); + free(parallel, q); + } catch (std::exception const &e) { + cout << "An exception is caught while computing on device.\n"; + terminate(); + } + + return 0; +} diff --git a/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/CMakeLists.txt b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/CMakeLists.txt new file mode 100644 index 0000000000..ce68388068 --- /dev/null +++ b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required (VERSION 3.5) +project (merge-spmv) + +set(CMAKE_CXX_COMPILER dpcpp) + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++17") + +add_executable(spmv src/spmv.cpp) + +if(WIN32) +add_custom_target(run spmv.exe) +else() +add_custom_target(run ./spmv) +endif() \ No newline at end of file diff --git a/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/License.txt b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/License.txt new file mode 100644 index 0000000000..0abc121bd7 --- /dev/null +++ b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/README.md b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/README.md new file mode 100644 index 0000000000..c1c06face0 --- /dev/null +++ b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/README.md @@ -0,0 +1,83 @@ +# Sparse Matrix Vector sample +`Sparse Matrix Vector sample` provides a parallel implementation of a merge based sparse matrix and vector multiplication algorithm using DPC++. + +For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04, Windows 10 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | The Sparse Matrix Vector sample demonstrates the following using the Intel® oneAPI DPC++/C++ Compiler
  • Offloading compute intensive parts of the application using lambda kernel
  • Measuring kernel execution time
+| Time to complete | 15 minutes + +## Purpose +Sparse linear algebra algorithms are common in HPC, in fields as machine learning and computational science. In this sample, a merge based sparse matrix and vector multiplication algorithm is implemented. The input matrix is in compressed sparse row format. Use a parallel merge model enables the application to efficiently offload compute intensive operation to the GPU. + +## Key implementation details +The basic DPC++ implementation explained in the code includes device selector, unified shared memory, kernel, and command groups. + +Compressed Sparse Row (CSR) representation for sparse matrix have three components: +
    +
+- Nonzero values
+- Column indices
+- Row offsets
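+
+As an illustration (the same example appears in the comments of src/spmv.cpp), the 4 x 4 sparse matrix
+
+```
+a 0 0 0
+b c 0 0
+0 0 0 d
+0 0 e f
+```
+
+has nonzero values `a, b, c, d, e, f`, column indices `0, 0, 1, 3, 2, 3`, and row offsets `0, 1, 3, 4, 6`.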
+ +Both row offsets and values indices can be thought of as sorted arrays. The progression of the computation is similar to that of merging two sorted arrays at a conceptual level. + +In parallel implementation, each thread independently identifies its scope of the merge and then performs only the amount of work that belongs this thread in the cohort of threads. + +## License +This code sample is licensed under MIT license + +## Building the Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System + +Perform the following steps: + +1. Build the program using the following `cmake` commands. +``` + $ cd merge-spmv + $ mkdir build + $ cd build + $ cmake .. + $ make +``` + +2. Run the program +``` + $ make run + +``` + +### On a Windows* System Using Visual Studio* version 2017 or Newer + +* Build the program using VS2017 or VS2019: Right click on the solution file and open using either VS2017 or VS2019 IDE. Right click on the project in Solution explorer and select Rebuild. From top menu select Debug -> Start without Debugging. +* Build the program using MSBuild: Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019". Run - MSBuild merge-spmv.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample + +### Example Output +``` +Device: Intel(R) Gen9 +Compute units: 24 +Work group size: 256 +Repeating 16 times to measure run time ... +Iteration: 1 +Iteration: 2 +Iteration: 3 +... +Iteration: 16 +Successfully completed sparse matrix and vector multiplication! 
+Time sequential: 0.00436269 sec +Time parallel: 0.00909913 sec + +``` diff --git a/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/merge-spmv.sln b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/merge-spmv.sln new file mode 100644 index 0000000000..c7bcce49d5 --- /dev/null +++ b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/merge-spmv.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.30104.148 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "spmv", "merge-spmv.vcxproj", "{BBCE2E87-CC7B-4AE7-A8A3-F8CDB90FD058}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BBCE2E87-CC7B-4AE7-A8A3-F8CDB90FD058}.Debug|x64.ActiveCfg = Debug|x64 + {BBCE2E87-CC7B-4AE7-A8A3-F8CDB90FD058}.Debug|x64.Build.0 = Debug|x64 + {BBCE2E87-CC7B-4AE7-A8A3-F8CDB90FD058}.Release|x64.ActiveCfg = Release|x64 + {BBCE2E87-CC7B-4AE7-A8A3-F8CDB90FD058}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {561E55E9-42C6-4FA5-AAD0-8E80B36D7E7B} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/merge-spmv.vcxproj b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/merge-spmv.vcxproj new file mode 100644 index 0000000000..4d69c37ab8 --- /dev/null +++ b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/merge-spmv.vcxproj @@ -0,0 +1,87 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {bbce2e87-cc7b-4ae7-a8a3-f8cdb90fd058} + Win32Proj + merge_spmv + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + true + + + false + + + + + + + + Level3 + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + + + + + Level3 + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + + + + diff --git a/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/sample.json b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/sample.json new file mode 100644 index 0000000000..a77bbc0e87 --- /dev/null +++ b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "C573751F-C04C-4EE0-A868-941EF460F944", + "name": "merge-spmv", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++/C++ Compiler/CPU and GPU" ], + "description": "merge-spmv using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild merge-spmv.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "spmv.exe" + ] + }] + } + } diff --git a/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/src/spmv.cpp b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/src/spmv.cpp new file mode 100644 index 0000000000..dd31f8b753 --- /dev/null +++ 
b/DirectProgramming/DPC++/SparseLinearAlgebra/merge-spmv/src/spmv.cpp @@ -0,0 +1,453 @@ +//============================================================== +// This sample provides a parallel implementation of a merge based sparse matrix +// and vector multiplication algorithm using DPC++. The input matrix is in +// compressed sparse row format. +//============================================================== +// Copyright © Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include +#include +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace std; +using namespace sycl; + +// n x n sparse matrix. +constexpr int n = 100 * 1000; + +// Number of non zero values in sparse matrix. +constexpr int nonzero = 2 * 1000 * 1000; + +// Maximum value of an element in the matrix. +constexpr int max_value = 100; + +// Number of repetitions. +constexpr int repetitions = 16; + +// Compressed Sparse Row (CSR) representation for sparse matrix. +// +// Example: The following 4 x 4 sparse matrix +// +// a 0 0 0 +// b c 0 0 +// 0 0 0 d +// 0 0 e f +// +// have 6 non zero elements in it: +// +// Index Row Column Value +// 0 0 0 a +// 1 1 0 b +// 2 1 1 c +// 3 2 3 d +// 4 3 2 e +// 5 3 3 f +// +// Its CSR representation is have three components: +// - Nonzero values: a, b, c, d, e, f +// - Column indices: 0, 0, 1, 3, 2, 3 +// - Row offsets: 0, 1, 3, 4, 6 +// +// Non zero values and their column indices directly correspond to the entries +// in the above table. +// +// Row offsets are offsets in the values array for the first non zero element of +// each row of the matrix. +// +// Row NonZeros NonZeros_SeenBefore +// 0 1 0 +// 1 2 1 +// 2 1 3 +// 3 2 4 +// - - 6 +typedef struct { + int *row_offsets; + int *column_indices; + float *values; +} CompressedSparseRow; + +// Allocate unified shared memory for storing matrix and vectors so that they +// are accessible from both the CPU and the device (e.g., a GPU). +bool AllocateMemory(queue &q, int thread_count, CompressedSparseRow *matrix, + float **x, float **y_sequential, float **y_parallel, + int **carry_row, float **carry_value) { + matrix->row_offsets = malloc_shared(n + 1, q); + matrix->column_indices = malloc_shared(nonzero, q); + matrix->values = malloc_shared(nonzero, q); + + *x = malloc_shared(n, q); + *y_sequential = malloc_shared(n, q); + *y_parallel = malloc_shared(n, q); + + *carry_row = malloc_shared(thread_count, q); + *carry_value = malloc_shared(thread_count, q); + + return (matrix->row_offsets != nullptr) && + (matrix->column_indices != nullptr) && (matrix->values != nullptr) && + (*x != nullptr) && (*y_sequential != nullptr) && + (*y_parallel != nullptr) && (*carry_row != nullptr) && + (*carry_value != nullptr); +} + +// Free allocated unified shared memory. 
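+// Pointers returned by malloc_shared are released with sycl::free, which
+// takes the queue (context) that was used for the allocation.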
+void FreeMemory(queue &q, CompressedSparseRow *matrix, float *x, + float *y_sequential, float *y_parallel, int *carry_row, + float *carry_value) { + if (matrix->row_offsets != nullptr) free(matrix->row_offsets, q); + if (matrix->column_indices != nullptr) free(matrix->column_indices, q); + if (matrix->values != nullptr) free(matrix->values, q); + + if (x != nullptr) free(x, q); + if (y_sequential != nullptr) free(y_sequential, q); + if (y_parallel != nullptr) free(y_parallel, q); + + if (carry_row != nullptr) free(carry_row, q); + if (carry_value != nullptr) free(carry_value, q); +} + +// Initialize inputs: sparse matrix and vector. +void InitializeSparseMatrixAndVector(CompressedSparseRow *matrix, float *x) { + map> indices; + + // Randomly choose a set of elements (i.e., row and column pairs) of the + // matrix. These elements will have non zero values. + for (int k = 0; k < nonzero; k++) { + int i = rand() % n; + int j = rand() % n; + + if (indices.find(i) == indices.end()) { + indices[i] = {j}; + + } else if (indices[i].find(j) == indices[i].end()) { + indices[i].insert(j); + + } else { + k--; + } + } + + int offset = 0; + + // Randomly choose non zero values of the sparse matrix. + for (int i = 0; i < n; i++) { + matrix->row_offsets[i] = offset; + + if (indices.find(i) != indices.end()) { + set &cols = indices[i]; + + for (auto it = cols.cbegin(); it != cols.cend(); ++it, ++offset) { + matrix->column_indices[offset] = *it; + matrix->values[offset] = rand() % max_value + 1; + } + } + } + + matrix->row_offsets[n] = nonzero; + + // Initialize input vector. + for (int i = 0; i < n; i++) { + x[i] = 1; + } +} + +// A sequential implementation of merge based sparse matrix and vector +// multiplication algorithm. +// +// Both row offsets and values indices can be thought of as sorted arrays. The +// progression of the computation is similar to that of merging two sorted +// arrays at a conceptual level. +// +// When a row offset and an index of the values array are equal (denoted as '?' +// below), the algorithm starts computing the value of a new element of the +// result vector. +// +// The algorithm continues to accumulate for the same element of the result +// vector otherwise (denoted as '*' below). +// +// Row indices -> 0 1 2 3 +// Row offsets -> 0 1 3 4 6 +// +// ? 0 a +// ? 1 b +// * 2 c +// ? 3 d +// ? 4 e +// * 5 f +// +// ^ ^ +// | | +// | Non zero values +// | +// Indices of values array +void MergeSparseMatrixVector(CompressedSparseRow *matrix, float *x, float *y) { + int row_index = 0; + int val_index = 0; + + y[row_index] = 0; + + while (val_index < nonzero) { + if (val_index < matrix->row_offsets[row_index + 1]) { + // Accumulate and move down. + y[row_index] += + matrix->values[val_index] * x[matrix->column_indices[val_index]]; + val_index++; + + } else { + // Move right. + row_index++; + y[row_index] = 0; + } + } + + for (row_index++; row_index < n; row_index++) { + y[row_index] = 0; + } +} + +// Merge Coordinate. +typedef struct { + int row_index; + int val_index; +} MergeCoordinate; + +// Given linear position on the merge path, find two dimensional merge +// coordinate (row index and value index pair) on the path. +MergeCoordinate MergePathBinarySearch(int diagonal, int *row_offsets) { + // Diagonal search range (in row index space). + int row_min = std::max(diagonal - nonzero, 0); + int row_max = std::min(diagonal, n); + + // 2D binary search along the diagonal search range. 
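+  // A point on this diagonal has coordinates (row, diagonal - row). The test
+  // below checks whether all nonzeros of rows 0..pivot fit within the value
+  // items consumed on this diagonal; if so, the crossing row is larger than
+  // pivot.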
+ while (row_min < row_max) { + int pivot = (row_min + row_max) >> 1; + + if (row_offsets[pivot + 1] <= diagonal - pivot - 1) { + // Keep top right half of diagonal range. + row_min = pivot + 1; + } else { + // Keep bottom left half of diagonal range. + row_max = pivot; + } + } + + MergeCoordinate coordinate; + + coordinate.row_index = std::min(row_min, n); + coordinate.val_index = diagonal - row_min; + + return coordinate; +} + +// The parallel implementation of spare matrix, vector multiplication algorithm +// uses this function as a subroutine. Each available thread calls this function +// with identical inputs, except the thread identifier (TID) is unique. Having a +// unique TID, each thread independently identifies its own, non overlapping +// share of the overall work. More importantly, each thread, except possibly the +// last one, handles the same amount of work. This implementation is an +// extension of the sequential implementation of the merge based sparse matrix, +// vector multiplication algorithm. It first identifies its scope of the merge +// and then performs only the amount of work that belongs this thread in the +// cohort of threads. +void MergeSparseMatrixVectorThread(int thread_count, int tid, + CompressedSparseRow matrix, float *x, + float *y, int *carry_row, + float *carry_value) { + int path_length = n + nonzero; // Merge path length. + int items_per_thread = (path_length + thread_count - 1) / + thread_count; // Merge items per thread. + + // Find start and end merge path coordinates for this thread. + int diagonal = std::min(items_per_thread * tid, path_length); + int diagonal_end = std::min(diagonal + items_per_thread, path_length); + + MergeCoordinate path = MergePathBinarySearch(diagonal, matrix.row_offsets); + MergeCoordinate path_end = + MergePathBinarySearch(diagonal_end, matrix.row_offsets); + + // Consume items-per-thread merge items. + float dot_product = 0; + + for (int i = 0; i < items_per_thread; i++) { + if (path.val_index < matrix.row_offsets[path.row_index + 1]) { + // Accumulate and move down. + dot_product += matrix.values[path.val_index] * + x[matrix.column_indices[path.val_index]]; + path.val_index++; + + } else { + // Output row total and move right. + y[path.row_index] = dot_product; + dot_product = 0; + path.row_index++; + } + } + + // Save carry. + carry_row[tid] = path_end.row_index; + carry_value[tid] = dot_product; +} + +// This is the parallel implementation of merge based sparse matrix and vector +// mutiplication algorithm. It works in three steps: +// 1. Initialize elements of the output vector to zero. +// 2. Multiply sparse matrix and vector. +// 3. Fix up rows of the output vector that spanned across multiple threads. +// First two steps are parallel. They utilize all available processors +// (threads). The last step performs a reduction. It could be parallel as well +// but is kept as sequential for the following reasons: +// 1. Number of operation in this step is proportional to the number of +// processors (threads). +// 2. Number of available threads is not too high. +void MergeSparseMatrixVector(queue &q, int compute_units, int work_group_size, + CompressedSparseRow matrix, float *x, float *y, + int *carry_row, float *carry_value) { + int thread_count = compute_units * work_group_size; + + // Initialize output vector. 
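+  // Each work-item zeroes its own contiguous chunk of about
+  // n / thread_count elements of y.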
+ q.parallel_for( + nd_range<1>(compute_units * work_group_size, work_group_size), + [=](nd_item<1> item) { + auto global_id = item.get_global_id(0); + auto items_per_thread = (n + thread_count - 1) / thread_count; + auto start = global_id * items_per_thread; + auto stop = start + items_per_thread; + + for (auto i = start; (i < stop) && (i < n); i++) { + y[i] = 0; + } + }); + + q.wait(); + + // Multiply sparse matrix and vector. + q.parallel_for( + nd_range<1>(compute_units * work_group_size, work_group_size), + [=](nd_item<1> item) { + auto global_id = item.get_global_id(0); + MergeSparseMatrixVectorThread(thread_count, global_id, matrix, x, y, + carry_row, carry_value); + }); + + q.wait(); + + // Carry fix up for rows spanning multiple threads. + for (int tid = 0; tid < thread_count - 1; tid++) { + if (carry_row[tid] < n) { + y[carry_row[tid]] += carry_value[tid]; + } + } +} + +// Check if two input vectors are equal. +bool VerifyVectorsAreEqual(float *u, float *v) { + for (int i = 0; i < n; i++) { + if (fabs(u[i] - v[i]) > 1E-06) { + return false; + } + } + + return true; +} + +int main() { + // Sparse matrix. + CompressedSparseRow matrix; + + // Input vector. + float *x; + + // Vector: result of sparse matrix and vector multiplication. + float *y_sequential; + float *y_parallel; + + // Auxiliary storage for parallel computation. + int *carry_row; + float *carry_value; + + try { + queue q{default_selector{}, dpc_common::exception_handler}; + auto device = q.get_device(); + + cout << "Device: " << device.get_info() << "\n"; + + // Find max number of compute/execution units and max number of threads per + // compute unit. + auto compute_units = device.get_info(); + auto work_group_size = device.get_info(); + + cout << "Compute units: " << compute_units << "\n"; + cout << "Work group size: " << work_group_size << "\n"; + + // Allocate memory. + if (!AllocateMemory(q, compute_units * work_group_size, &matrix, &x, + &y_sequential, &y_parallel, &carry_row, &carry_value)) { + cout << "Memory allocation failure.\n"; + FreeMemory(q, &matrix, x, y_sequential, y_parallel, carry_row, + carry_value); + return -1; + } + + // Initialize. + InitializeSparseMatrixAndVector(&matrix, x); + + // Warm up the JIT. + MergeSparseMatrixVector(q, compute_units, work_group_size, matrix, x, + y_parallel, carry_row, carry_value); + + // Time executions. + double elapsed_s = 0; + double elapsed_p = 0; + int i; + + cout << "Repeating " << repetitions << " times to measure run time ...\n"; + + for (i = 0; i < repetitions; i++) { + cout << "Iteration: " << (i + 1) << "\n"; + + // Sequential compute. + dpc_common::TimeInterval timer_s; + + MergeSparseMatrixVector(&matrix, x, y_sequential); + elapsed_s += timer_s.Elapsed(); + + // Parallel compute. + dpc_common::TimeInterval timer_p; + + MergeSparseMatrixVector(q, compute_units, work_group_size, matrix, x, + y_parallel, carry_row, carry_value); + elapsed_p += timer_p.Elapsed(); + + // Verify two results are equal. 
+ if (!VerifyVectorsAreEqual(y_sequential, y_parallel)) { + cout << "Failed to correctly compute!\n"; + break; + } + } + + if (i == repetitions) { + cout << "Successfully completed sparse matrix and vector " + "multiplication!\n"; + + elapsed_s /= repetitions; + elapsed_p /= repetitions; + + cout << "Time sequential: " << elapsed_s << " sec\n"; + cout << "Time parallel: " << elapsed_p << " sec\n"; + } + + FreeMemory(q, &matrix, x, y_sequential, y_parallel, carry_row, carry_value); + } catch (std::exception const &e) { + cout << "An exception is caught while computing on device.\n"; + terminate(); + } + + return 0; +}