Merged
33 commits
8a471f1
Update simple add sample
MoushumiMaria Jul 2, 2020
abc813c
Update make files
MoushumiMaria Jul 6, 2020
3914963
Update fpga make file
MoushumiMaria Jul 6, 2020
1ba72da
Add dpc_common.hpp
MoushumiMaria Jul 8, 2020
01df0de
Update sample.json
MoushumiMaria Jul 8, 2020
2ba6714
Fix Makefile.win
MoushumiMaria Jul 8, 2020
a5d1e34
Update Makefile.win
MoushumiMaria Jul 8, 2020
8548c6c
Update sample.json
MoushumiMaria Jul 9, 2020
66f2bf1
Merge branch 'master' of https://github.com/moushumi-maria/oneAPI-sam…
MoushumiMaria Jul 9, 2020
0adf6af
Remove dpc_common.hpp
MoushumiMaria Jul 9, 2020
c924d40
Update VS project file
MoushumiMaria Jul 14, 2020
77f12e4
Update README.md
MoushumiMaria Jul 14, 2020
706c75d
Merge branch 'master' into master
MoushumiMaria Jul 14, 2020
ed1fabb
Update sample.json
MoushumiMaria Jul 14, 2020
f0a5d55
Merge branch 'master' of https://github.com/oneapi-src/oneAPI-samples
MoushumiMaria Jul 17, 2020
95aeaae
Add stb
MoushumiMaria Jul 17, 2020
f077919
Update read me file
MoushumiMaria Jul 17, 2020
b5bd116
Initial commit
MoushumiMaria Jul 24, 2020
7ef9c5e
Update License.txt
JoeOster Jul 27, 2020
3336cc5
Change location of matrix multiplication sample
MoushumiMaria Jul 27, 2020
1819f7b
Fix matrix mul sample VS project file
MoushumiMaria Sep 4, 2020
153873c
Merge branch 'master' into master
MoushumiMaria Sep 4, 2020
0281c72
Merge branch 'master' of https://github.com/oneapi-src/oneAPI-samples
MoushumiMaria Sep 29, 2020
b8b3228
Update samples for beta10 release
MoushumiMaria Oct 2, 2020
168fe1e
Fix for Windows
MoushumiMaria Oct 2, 2020
7edeedf
Fix for FPGA
MoushumiMaria Oct 2, 2020
0f8f5a3
Fix for FPGA
MoushumiMaria Oct 2, 2020
34d163b
Fix for FPGA to support both beta09 and beta10
MoushumiMaria Oct 2, 2020
e0c09ef
Add header comment
MoushumiMaria Oct 2, 2020
9e1ab37
Samples: block apsp and merge spmv
MoushumiMaria Oct 5, 2020
fb6a426
Add readme files
MoushumiMaria Oct 6, 2020
c0827a4
Update readme file
MoushumiMaria Oct 6, 2020
60a4474
Update sample.json
MoushumiMaria Oct 6, 2020
@@ -58,9 +58,9 @@ int main() {

// Create 2D buffers for matrices, buffer c is bound with host memory c_back

buffer<float, 2> a(range(M, N));
buffer<float, 2> b(range(N, P));
buffer c(reinterpret_cast<float *>(c_back), range(M, P));
buffer<float, 2> a_buf(range(M, N));
buffer<float, 2> b_buf(range(N, P));
buffer c_buf(reinterpret_cast<float *>(c_back), range(M, P));

cout << "Problem size: c(" << M << "," << P << ") = a(" << M << "," << N
<< ") * b(" << N << "," << P << ")\n";
@@ -71,40 +71,40 @@ int main() {
// execution ordering.

// Submit command group to queue to initialize matrix a
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Get write only access to the buffer on a device.
auto accessor = a.get_access<access::mode::write>(h);
accessor a(a_buf, h, write_only);

// Execute kernel.
h.parallel_for(range(M, N), [=](id<2> index) {
h.parallel_for(range(M, N), [=](auto index) {
// Each element of matrix a is 1.
accessor[index] = 1.0f;
a[index] = 1.0f;
});
});

// Submit command group to queue to initialize matrix b
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Get write only access to the buffer on a device
auto accessor = b.get_access<access::mode::write>(h);
accessor b(b_buf, h, write_only);

// Execute kernel.
h.parallel_for(range(N, P), [=](id<2> index) {
h.parallel_for(range(N, P), [=](auto index) {
// Each column of b is the sequence 1,2,...,N
accessor[index] = index[0] + 1.0f;
b[index] = index[0] + 1.0f;
});
});

// Submit command group to queue to multiply matrices: c = a * b
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Read from a and b, write to c
auto A = a.get_access<access::mode::read>(h);
auto B = b.get_access<access::mode::read>(h);
auto C = c.get_access<access::mode::write>(h);
accessor A(a_buf, h, read_only);
accessor B(b_buf, h, read_only);
accessor C(c_buf, h, write_only);

int width_a = a.get_range()[1];
int width_a = a_buf.get_range()[1];

// Execute kernel.
h.parallel_for(range(M, P), [=](id<2> index) {
h.parallel_for(range(M, P), [=](auto index) {
// Get global position in Y direction.
int row = index[0];
// Get global position in X direction.
@@ -22,9 +22,21 @@
#include <CL/sycl.hpp>
#include <array>
#include <iostream>

// dpc_common.hpp can be found in the dev-utilities include folder.
// e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
#include "dpc_common.hpp"

#if FPGA || FPGA_EMULATOR
#include <CL/sycl/intel/fpga_extensions.hpp>
// Header locations and some DPC++ extensions changed between beta09 and beta10
// Temporarily modify the code sample to accept either version
#define BETA09 20200827
#if __SYCL_COMPILER_VERSION <= BETA09
#include <CL/sycl/intel/fpga_extensions.hpp>
namespace INTEL = sycl::intel; // Namespace alias for backward compatibility
#else
#include <CL/sycl/INTEL/fpga_extensions.hpp>
#endif
#endif

using namespace sycl;
@@ -39,26 +51,26 @@ typedef array<int, array_size> IntArray;
//************************************
void IotaParallel(queue &q, IntArray &a_array, int value) {
// Create the range object for the array managed by the buffer.
range<1> num_items{a_array.size()};
range num_items{a_array.size()};

// Create a buffer that holds the data shared between the host and the devices.
// The buffer destructor is responsible for copying the data back to the host when it
// goes out of scope.
buffer a_buf(a_array.data(), num_items);
buffer a_buf(a_array);

// Submit a command group to the queue by a lambda function that contains the
// data access permission and device computation (kernel).
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Create an accessor with write permission.
auto a = a_buf.get_access<access::mode::write>(h);
accessor a(a_buf, h, write_only);

// Use parallel_for to populate consecutive numbers starting with a
// specified value in parallel on device. This executes the kernel.
// 1st parameter is the number of work items to use.
// 2nd parameter is the kernel, a lambda that specifies what to do per
// work item. The parameter of the lambda is the work item id.
// DPC++ supports unnamed lambda kernel by default.
h.parallel_for(num_items, [=](id<1> i) { a[i] = value + i; });
h.parallel_for(num_items, [=](auto i) { a[i] = value + i; });
});
}

@@ -69,10 +81,10 @@ int main() {
// Create device selector for the device of your interest.
#if FPGA_EMULATOR
// DPC++ extension: FPGA emulator selector on systems without FPGA card.
intel::fpga_emulator_selector d_selector;
INTEL::fpga_emulator_selector d_selector;
#elif FPGA
// DPC++ extension: FPGA selector on systems with FPGA card.
intel::fpga_selector d_selector;
INTEL::fpga_selector d_selector;
#else
// The default device selector will select the most performant device.
default_selector d_selector;
@@ -96,8 +108,8 @@ int main() {
// Parallel iota in DPC++.
IotaParallel(q, parallel, value);
} catch (std::exception const &e) {
cout << "An exception is caught while computing on device.\n";
terminate();
cout << "An exception is caught while computing on device.\n";
terminate();
}

// Verify two results are equal.
@@ -114,7 +126,7 @@ int main() {
// Print out iota result.
for (int i = 0; i < indices_size; i++) {
int j = indices[i];
if (i == indices_size - 1) std::cout << "...\n";
if (i == indices_size - 1) cout << "...\n";
cout << "[" << j << "]: " << j << " + " << value << " = "
<< parallel[j] << "\n";
}
@@ -22,9 +22,21 @@
#include <CL/sycl.hpp>
#include <array>
#include <iostream>

// dpc_common.hpp can be found in the dev-utilities include folder.
// e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
#include "dpc_common.hpp"

#if FPGA || FPGA_EMULATOR
#include <CL/sycl/intel/fpga_extensions.hpp>
// Header locations and some DPC++ extensions changed between beta09 and beta10
// Temporarily modify the code sample to accept either version
#define BETA09 20200827
#if __SYCL_COMPILER_VERSION <= BETA09
#include <CL/sycl/intel/fpga_extensions.hpp>
namespace INTEL = sycl::intel; // Namespace alias for backward compatibility
#else
#include <CL/sycl/INTEL/fpga_extensions.hpp>
#endif
#endif

using namespace sycl;
@@ -38,15 +50,15 @@ constexpr size_t array_size = 10000;
//************************************
void IotaParallel(queue &q, int *a, size_t size, int value) {
// Create the range object for the array.
range<1> num_items{size};
range num_items{size};

// Use parallel_for to populate consecutive numbers starting with a specified
// value in parallel on device. This executes the kernel.
// 1st parameter is the number of work items to use.
// 2nd parameter is the kernel, a lambda that specifies what to do per
// work item. The parameter of the lambda is the work item id.
// DPC++ supports unnamed lambda kernel by default.
auto e = q.parallel_for(num_items, [=](id<1> i) { a[i] = value + i; });
auto e = q.parallel_for(num_items, [=](auto i) { a[i] = value + i; });

// q.parallel_for() is an asynchronous call. DPC++ runtime enqueues and runs
// the kernel asynchronously. Wait for the asynchronous call to complete.
@@ -60,10 +72,10 @@ int main() {
// Create device selector for the device of your interest.
#if FPGA_EMULATOR
// DPC++ extension: FPGA emulator selector on systems without FPGA card.
intel::fpga_emulator_selector d_selector;
INTEL::fpga_emulator_selector d_selector;
#elif FPGA
// DPC++ extension: FPGA selector on systems with FPGA card.
intel::fpga_selector d_selector;
INTEL::fpga_selector d_selector;
#else
// The default device selector will select the most performant device.
default_selector d_selector;
@@ -110,16 +122,16 @@ int main() {
// Print out iota result.
for (int i = 0; i < indices_size; i++) {
int j = indices[i];
if (i == indices_size - 1) std::cout << "...\n";
cout << "[" << j << "]: " << j << " + " << value << " = "
if (i == indices_size - 1) cout << "...\n";
cout << "[" << j << "]: " << j << " + " << value << " = "
<< sequential[j] << "\n";
}

free(sequential, q);
free(parallel, q);
} catch (std::exception const &e) {
cout << "An exception is caught while computing on device.\n";
terminate();
cout << "An exception is caught while computing on device.\n";
terminate();
}

cout << "Successfully completed on device.\n";
@@ -0,0 +1,23 @@
cmake_minimum_required (VERSION 3.5)
project (all-pairs-shortest-paths)

set(CMAKE_CXX_COMPILER dpcpp)


# Set default build type to RelWithDebInfo if not specified
if (NOT CMAKE_BUILD_TYPE)
message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info")
set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE
STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++17")

add_executable(apsp src/apsp.cpp)

if(WIN32)
add_custom_target(run apsp.exe)
else()
add_custom_target(run ./apsp)
endif()
@@ -0,0 +1,7 @@
Copyright Intel Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,79 @@
# All Pairs Shortest Paths sample
`All Pairs Shortest Paths` uses the Floyd-Warshall algorithm to find the shortest paths between all pairs of vertices in a graph. It uses a parallel blocked algorithm that lets the application efficiently offload compute-intensive work to the GPU.

For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments.

| Optimized for | Description
|:--- |:---
| OS | Linux Ubuntu 18.04, Windows 10
| Hardware | Skylake with GEN9 or newer
| Software | Intel&reg; oneAPI DPC++/C++ Compiler
| What you will learn | The All Pairs Shortest Paths sample demonstrates the following using the Intel&reg; oneAPI DPC++/C++ Compiler: <ul><li>Offloading compute-intensive parts of the application using lambda kernels</li><li>Measuring kernel execution time</li></ul>
| Time to complete | 15 minutes

## Purpose
This sample uses the blocked Floyd-Warshall all pairs shortest paths algorithm to compute a matrix that represents the minimum distance from any node to every other node in the graph. With parallel blocked processing, independent blocks can be calculated simultaneously by distributing the computation to the GPU.

## Key implementation details
The basic DPC++ implementation explained in the code includes a device selector, unified shared memory, kernels, and command groups.
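
For orientation, here is a minimal, self-contained sketch of those pieces: a device selector, a unified shared memory allocation, and a kernel submitted through a queue. It is illustrative only and is not taken from this sample's sources.
```
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;

int main() {
  // Device selector: picks the most performant available device.
  default_selector d_selector;
  queue q(d_selector);

  constexpr size_t n = 16;
  // Unified shared memory: visible to both the host and the device.
  int *data = malloc_shared<int>(n, q);

  // Kernel submitted through the queue; one work item per element.
  q.parallel_for(range<1>(n), [=](id<1> i) { data[i] = static_cast<int>(i[0]); }).wait();

  std::cout << "data[5] = " << data[5] << "\n";
  free(data, q);
  return 0;
}
```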

The parallel implementation of the blocked Floyd-Warshall algorithm has three phases per round. Once the previous round of these phases is complete, phase 1 is independent; phase 2 can execute only after phase 1 completes; similarly, phase 3 depends on phase 2 and can execute only after phase 2 is complete.

The inner loop of the sequential implementation is:
`g[i][j] = min(g[i][j], g[i][k] + g[k][j])`

A careful observation shows that for the kth iteration of the outer loop, the computation depends only on cells in the kth column, `g[i][k]`, or in the kth row, `g[k][j]`, of the graph. Phase 1 handles `g[k][k]`, phase 2 handles `g[*][k]` and `g[k][*]`, and phase 3 handles `g[*][*]`, in that sequence. This cell-level observation largely carries over to the blocks as well.

In each phase, the computation within a block can proceed independently and in parallel.
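
For reference, the following plain sequential Floyd-Warshall sketch applies exactly the update shown above to an n x n distance matrix; the sample's blocked kernels reorganize these same updates into the three phases described. This is an illustrative sketch, not code taken from the sample.
```
#include <algorithm>
#include <vector>

// Sequential Floyd-Warshall: after the loops, g[i][j] holds the length of the
// shortest path from vertex i to vertex j. g starts as the n x n matrix of
// edge weights (with a large value for missing edges).
void FloydWarshall(std::vector<std::vector<float>> &g) {
  const size_t n = g.size();
  for (size_t k = 0; k < n; ++k)      // intermediate vertex
    for (size_t i = 0; i < n; ++i)    // source vertex
      for (size_t j = 0; j < n; ++j)  // destination vertex
        g[i][j] = std::min(g[i][j], g[i][k] + g[k][j]);
}
```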

## License
This code sample is licensed under the MIT license.

## Building the Program for CPU and GPU

### Include Files
The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.

### Running Samples in DevCloud
If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well as whether to run in batch or interactive mode. For more information, see the Intel&reg; oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/).

### On a Linux* System

Perform the following steps:

1. Build the program using the following `cmake` commands:
```
$ cd all-pairs-shortest-paths
$ mkdir build
$ cd build
$ cmake ..
$ make
```

2. Run the program:
```
$ make run
```

### On a Windows* System Using Visual Studio* version 2017 or Newer

* Build the program using VS2017 or VS2019: Right-click the solution file and open it in the VS2017 or VS2019 IDE. Right-click the project in Solution Explorer and select Rebuild. From the top menu, select Debug -> Start Without Debugging.
* Build the program using MSBuild: Open the "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" and run `MSBuild all-pairs-shortest-paths.sln /t:Rebuild /p:Configuration="Release"`.

## Running the sample

### Example Output
```
Device: Intel(R) Gen9
Repeating computation 8 times to measure run time ...
Iteration: 1
Iteration: 2
Iteration: 3
...
Iteration: 8
Successfully computed all pairs shortest paths in parallel!
Time sequential: 0.583029 sec
Time parallel: 0.159223 sec

```
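
The sequential and parallel times shown above are wall-clock measurements taken on the host around the corresponding computations. A generic way to collect such numbers is sketched below; this assumes plain `std::chrono` timing and is not the sample's exact code.
```
#include <chrono>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> v(1'000'000, 1.0f);

  auto start = std::chrono::steady_clock::now();
  // ... region being measured, e.g. a kernel submission followed by wait() ...
  float sum = 0.0f;
  for (float x : v) sum += x;
  auto stop = std::chrono::steady_clock::now();

  double seconds = std::chrono::duration<double>(stop - start).count();
  std::cout << "Time: " << seconds << " sec (checksum " << sum << ")\n";
  return 0;
}
```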
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30104.148
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "apsp", "all-pairs-shortest-paths.vcxproj", "{28C45A93-5D80-4635-BAFB-D5386EEEA466}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Debug|x64.ActiveCfg = Debug|x64
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Debug|x64.Build.0 = Debug|x64
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Release|x64.ActiveCfg = Release|x64
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {2ADBAE67-858D-4FA7-B81F-3C9AF9347EFA}
EndGlobalSection
EndGlobal