Merged
33 commits
8a471f1
Update simple add sample
MoushumiMaria Jul 2, 2020
abc813c
Update make files
MoushumiMaria Jul 6, 2020
3914963
Update fpga make file
MoushumiMaria Jul 6, 2020
1ba72da
Add dpc_common.hpp
MoushumiMaria Jul 8, 2020
01df0de
Update sample.json
MoushumiMaria Jul 8, 2020
2ba6714
Fix Makefile.win
MoushumiMaria Jul 8, 2020
a5d1e34
Update Makefile.win
MoushumiMaria Jul 8, 2020
8548c6c
Update sample.json
MoushumiMaria Jul 9, 2020
66f2bf1
Merge branch 'master' of https://github.com/moushumi-maria/oneAPI-sam…
MoushumiMaria Jul 9, 2020
0adf6af
Remove dpc_common.hpp
MoushumiMaria Jul 9, 2020
c924d40
Update VS project file
MoushumiMaria Jul 14, 2020
77f12e4
Update README.md
MoushumiMaria Jul 14, 2020
706c75d
Merge branch 'master' into master
MoushumiMaria Jul 14, 2020
ed1fabb
Update sample.json
MoushumiMaria Jul 14, 2020
f0a5d55
Merge branch 'master' of https://github.com/oneapi-src/oneAPI-samples
MoushumiMaria Jul 17, 2020
95aeaae
Add stb
MoushumiMaria Jul 17, 2020
f077919
Update read me file
MoushumiMaria Jul 17, 2020
b5bd116
Initial commit
MoushumiMaria Jul 24, 2020
7ef9c5e
Update License.txt
JoeOster Jul 27, 2020
3336cc5
Change location of matrix multiplication sample
MoushumiMaria Jul 27, 2020
1819f7b
Fix matrix mul sample VS project file
MoushumiMaria Sep 4, 2020
153873c
Merge branch 'master' into master
MoushumiMaria Sep 4, 2020
0281c72
Merge branch 'master' of https://github.com/oneapi-src/oneAPI-samples
MoushumiMaria Sep 29, 2020
b8b3228
Update samples for beta10 release
MoushumiMaria Oct 2, 2020
168fe1e
Fix for Windows
MoushumiMaria Oct 2, 2020
7edeedf
Fix for FPGA
MoushumiMaria Oct 2, 2020
0f8f5a3
Fix for FPGA
MoushumiMaria Oct 2, 2020
34d163b
Fix for FPGA to support both beta09 and beta10
MoushumiMaria Oct 2, 2020
e0c09ef
Add header comment
MoushumiMaria Oct 2, 2020
9e1ab37
Samples: block apsp and merge spmv
MoushumiMaria Oct 5, 2020
fb6a426
Add readme files
MoushumiMaria Oct 6, 2020
c0827a4
Update readme file
MoushumiMaria Oct 6, 2020
60a4474
Update sample.json
MoushumiMaria Oct 6, 2020
@@ -58,9 +58,9 @@ int main() {

// Create 2D buffers for matrices, buffer c is bound with host memory c_back

buffer<float, 2> a(range(M, N));
buffer<float, 2> b(range(N, P));
buffer c(reinterpret_cast<float *>(c_back), range(M, P));
buffer<float, 2> a_buf(range(M, N));
buffer<float, 2> b_buf(range(N, P));
buffer c_buf(reinterpret_cast<float *>(c_back), range(M, P));

cout << "Problem size: c(" << M << "," << P << ") = a(" << M << "," << N
<< ") * b(" << N << "," << P << ")\n";
@@ -71,40 +71,40 @@ int main() {
// execution ordering.

// Submit command group to queue to initialize matrix a
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Get write only access to the buffer on a device.
auto accessor = a.get_access<access::mode::write>(h);
accessor a(a_buf, h, write_only);

// Execute kernel.
h.parallel_for(range(M, N), [=](id<2> index) {
h.parallel_for(range(M, N), [=](auto index) {
// Each element of matrix a is 1.
accessor[index] = 1.0f;
a[index] = 1.0f;
});
});

// Submit command group to queue to initialize matrix b
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Get write only access to the buffer on a device
auto accessor = b.get_access<access::mode::write>(h);
accessor b(b_buf, h, write_only);

// Execute kernel.
h.parallel_for(range(N, P), [=](id<2> index) {
h.parallel_for(range(N, P), [=](auto index) {
// Each column of b is the sequence 1,2,...,N
accessor[index] = index[0] + 1.0f;
b[index] = index[0] + 1.0f;
});
});

// Submit command group to queue to multiply matrices: c = a * b
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Read from a and b, write to c
auto A = a.get_access<access::mode::read>(h);
auto B = b.get_access<access::mode::read>(h);
auto C = c.get_access<access::mode::write>(h);
accessor A(a_buf, h, read_only);
accessor B(b_buf, h, read_only);
accessor C(c_buf, h, write_only);

int width_a = a.get_range()[1];
int width_a = a_buf.get_range()[1];

// Execute kernel.
h.parallel_for(range(M, P), [=](id<2> index) {
h.parallel_for(range(M, P), [=](auto index) {
// Get global position in Y direction.
int row = index[0];
// Get global position in X direction.
@@ -22,9 +22,21 @@
#include <CL/sycl.hpp>
#include <array>
#include <iostream>

// dpc_common.hpp can be found in the dev-utilities include folder.
// e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
#include "dpc_common.hpp"

#if FPGA || FPGA_EMULATOR
#include <CL/sycl/intel/fpga_extensions.hpp>
// Header locations and some DPC++ extensions changed between beta09 and beta10
// Temporarily modify the code sample to accept either version
#define BETA09 20200827
#if __SYCL_COMPILER_VERSION <= BETA09
#include <CL/sycl/intel/fpga_extensions.hpp>
namespace INTEL = sycl::intel; // Namespace alias for backward compatibility
#else
#include <CL/sycl/INTEL/fpga_extensions.hpp>
#endif
#endif

using namespace sycl;
@@ -39,26 +51,26 @@ typedef array<int, array_size> IntArray;
//************************************
void IotaParallel(queue &q, IntArray &a_array, int value) {
// Create the range object for the array managed by the buffer.
range<1> num_items{a_array.size()};
range num_items{a_array.size()};

// Create a buffer that holds the data shared between the host and the devices.
// The buffer destructor is responsible for copying the data back to the host when it
// goes out of scope.
buffer a_buf(a_array.data(), num_items);
buffer a_buf(a_array);

// Submit a command group to the queue by a lambda function that contains the
// data access permission and device computation (kernel).
q.submit([&](handler &h) {
q.submit([&](auto &h) {
// Create an accessor with write permission.
auto a = a_buf.get_access<access::mode::write>(h);
accessor a(a_buf, h, write_only);

// Use parallel_for to populate consecutive numbers starting with a
// specified value in parallel on device. This executes the kernel.
// 1st parameter is the number of work items to use.
// 2nd parameter is the kernel, a lambda that specifies what to do per
// work item. The parameter of the lambda is the work item id.
// DPC++ supports unnamed lambda kernel by default.
h.parallel_for(num_items, [=](id<1> i) { a[i] = value + i; });
h.parallel_for(num_items, [=](auto i) { a[i] = value + i; });
});
}

@@ -69,10 +81,10 @@ int main() {
// Create device selector for the device of your interest.
#if FPGA_EMULATOR
// DPC++ extension: FPGA emulator selector on systems without FPGA card.
intel::fpga_emulator_selector d_selector;
INTEL::fpga_emulator_selector d_selector;
#elif FPGA
// DPC++ extension: FPGA selector on systems with FPGA card.
intel::fpga_selector d_selector;
INTEL::fpga_selector d_selector;
#else
// The default device selector will select the most performant device.
default_selector d_selector;
@@ -96,8 +108,8 @@ int main() {
// Parallel iota in DPC++.
IotaParallel(q, parallel, value);
} catch (std::exception const &e) {
cout << "An exception is caught while computing on device.\n";
terminate();
cout << "An exception is caught while computing on device.\n";
terminate();
}

// Verify two results are equal.
@@ -114,7 +126,7 @@ int main() {
// Print out iota result.
for (int i = 0; i < indices_size; i++) {
int j = indices[i];
if (i == indices_size - 1) std::cout << "...\n";
if (i == indices_size - 1) cout << "...\n";
cout << "[" << j << "]: " << j << " + " << value << " = "
<< parallel[j] << "\n";
}
@@ -22,9 +22,21 @@
#include <CL/sycl.hpp>
#include <array>
#include <iostream>

// dpc_common.hpp can be found in the dev-utilities include folder.
// e.g., $ONEAPI_ROOT/dev-utilities/<version>/include/dpc_common.hpp
#include "dpc_common.hpp"

#if FPGA || FPGA_EMULATOR
#include <CL/sycl/intel/fpga_extensions.hpp>
// Header locations and some DPC++ extensions changed between beta09 and beta10
// Temporarily modify the code sample to accept either version
#define BETA09 20200827
#if __SYCL_COMPILER_VERSION <= BETA09
#include <CL/sycl/intel/fpga_extensions.hpp>
namespace INTEL = sycl::intel; // Namespace alias for backward compatibility
#else
#include <CL/sycl/INTEL/fpga_extensions.hpp>
#endif
#endif

using namespace sycl;
@@ -38,15 +50,15 @@ constexpr size_t array_size = 10000;
//************************************
void IotaParallel(queue &q, int *a, size_t size, int value) {
// Create the range object for the array.
range<1> num_items{size};
range num_items{size};

// Use parallel_for to populate consecutive numbers starting with a specified
// value in parallel on device. This executes the kernel.
// 1st parameter is the number of work items to use.
// 2nd parameter is the kernel, a lambda that specifies what to do per
// work item. The parameter of the lambda is the work item id.
// DPC++ supports unnamed lambda kernel by default.
auto e = q.parallel_for(num_items, [=](id<1> i) { a[i] = value + i; });
auto e = q.parallel_for(num_items, [=](auto i) { a[i] = value + i; });

// q.parallel_for() is an asynchronous call. DPC++ runtime enqueues and runs
// the kernel asynchronously. Wait for the asynchronous call to complete.
@@ -60,10 +72,10 @@ int main() {
// Create device selector for the device of your interest.
#if FPGA_EMULATOR
// DPC++ extension: FPGA emulator selector on systems without FPGA card.
intel::fpga_emulator_selector d_selector;
INTEL::fpga_emulator_selector d_selector;
#elif FPGA
// DPC++ extension: FPGA selector on systems with FPGA card.
intel::fpga_selector d_selector;
INTEL::fpga_selector d_selector;
#else
// The default device selector will select the most performant device.
default_selector d_selector;
@@ -110,16 +122,16 @@ int main() {
// Print out iota result.
for (int i = 0; i < indices_size; i++) {
int j = indices[i];
if (i == indices_size - 1) std::cout << "...\n";
cout << "[" << j << "]: " << j << " + " << value << " = "
if (i == indices_size - 1) cout << "...\n";
cout << "[" << j << "]: " << j << " + " << value << " = "
<< sequential[j] << "\n";
}

free(sequential, q);
free(parallel, q);
} catch (std::exception const &e) {
cout << "An exception is caught while computing on device.\n";
terminate();
cout << "An exception is caught while computing on device.\n";
terminate();
}

cout << "Successfully completed on device.\n";
@@ -0,0 +1,23 @@
cmake_minimum_required (VERSION 3.5)
project (all-pairs-shortest-paths)

set(CMAKE_CXX_COMPILER dpcpp)


# Set default build type to RelWithDebInfo if not specified
if (NOT CMAKE_BUILD_TYPE)
message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info")
set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE
STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -std=c++17")

add_executable(apsp src/apsp.cpp)

if(WIN32)
add_custom_target(run apsp.exe)
else()
add_custom_target(run ./apsp)
endif()
@@ -0,0 +1,7 @@
Copyright Intel Corporation

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,79 @@
# All Pairs Shortest Paths sample
`All Pairs Shortest Paths` uses the Floyd-Warshall algorithm to find the shortest paths between all pairs of vertices in a graph. It uses a parallel blocked algorithm that lets the application efficiently offload compute-intensive work to the GPU.

For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments.

| Optimized for | Description
|:--- |:---
| OS | Linux Ubuntu 18.04, Windows 10
| Hardware | Skylake with GEN9 or newer
| Software | Intel&reg; oneAPI DPC++/C++ Compiler
| What you will learn | The All Pairs Shortest Paths sample demonstrates the following using the Intel&reg; oneAPI DPC++/C++ Compiler: <ul><li>Offloading compute-intensive parts of the application using lambda kernels</li><li>Measuring kernel execution time</li></ul>
| Time to complete | 15 minutes

## Purpose
This sample uses the blocked Floyd-Warshall all pairs shortest paths algorithm to compute a matrix that represents the minimum distance from any node to every other node in the graph. With parallel blocked processing, independent blocks can be calculated simultaneously by distributing the computation to the GPU.

## Key implementation details
The basic DPC++ implementation explained in the code includes a device selector, unified shared memory, kernels, and command groups.
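
For orientation, here is a minimal, self-contained sketch of those pieces: a device selector, a unified shared memory allocation, and a kernel submitted through a queue. It is illustrative only and is not taken from this sample's sources.
```
#include <CL/sycl.hpp>
#include <iostream>
using namespace sycl;

int main() {
  // Device selector: picks the most performant available device.
  default_selector d_selector;
  queue q(d_selector);

  constexpr size_t n = 16;
  // Unified shared memory: visible to both the host and the device.
  int *data = malloc_shared<int>(n, q);

  // Kernel submitted through the queue; one work item per element.
  q.parallel_for(range<1>(n), [=](id<1> i) { data[i] = static_cast<int>(i[0]); }).wait();

  std::cout << "data[5] = " << data[5] << "\n";
  free(data, q);
  return 0;
}
```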

The parallel implementation of the blocked Floyd-Warshall algorithm has three phases per round. Once the previous round of these phases is complete, phase 1 is independent; phase 2 can execute only after phase 1 completes; similarly, phase 3 depends on phase 2 and can execute only after phase 2 is complete.

The inner loop of the sequential implementation is:
`g[i][j] = min(g[i][j], g[i][k] + g[k][j])`

A careful observation shows that for the kth iteration of the outer loop, the computation depends only on cells in the kth column, `g[i][k]`, or in the kth row, `g[k][j]`, of the graph. Phase 1 handles `g[k][k]`, phase 2 handles `g[*][k]` and `g[k][*]`, and phase 3 handles `g[*][*]`, in that sequence. This cell-level observation largely carries over to the blocks as well.

In each phase, the computation within a block can proceed independently and in parallel.
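
For reference, the following plain sequential Floyd-Warshall sketch applies exactly the update shown above to an n x n distance matrix; the sample's blocked kernels reorganize these same updates into the three phases described. This is an illustrative sketch, not code taken from the sample.
```
#include <algorithm>
#include <vector>

// Sequential Floyd-Warshall: after the loops, g[i][j] holds the length of the
// shortest path from vertex i to vertex j. g starts as the n x n matrix of
// edge weights (with a large value for missing edges).
void FloydWarshall(std::vector<std::vector<float>> &g) {
  const size_t n = g.size();
  for (size_t k = 0; k < n; ++k)      // intermediate vertex
    for (size_t i = 0; i < n; ++i)    // source vertex
      for (size_t j = 0; j < n; ++j)  // destination vertex
        g[i][j] = std::min(g[i][j], g[i][k] + g[k][j]);
}
```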

## License
This code sample is licensed under the MIT license.

## Building the Program for CPU and GPU

### Include Files
The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.

### Running Samples in DevCloud
If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well as whether to run in batch or interactive mode. For more information, see the Intel&reg; oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/).

### On a Linux* System

Perform the following steps:

1. Build the program using the following `cmake` commands:
```
$ cd all-pairs-shortest-paths
$ mkdir build
$ cd build
$ cmake ..
$ make
```

2. Run the program:
```
$ make run
```

### On a Windows* System Using Visual Studio* version 2017 or Newer

* Build the program using VS2017 or VS2019: Right-click the solution file and open it in the VS2017 or VS2019 IDE. Right-click the project in Solution Explorer and select Rebuild. From the top menu, select Debug -> Start Without Debugging.
* Build the program using MSBuild: Open the "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" and run `MSBuild all-pairs-shortest-paths.sln /t:Rebuild /p:Configuration="Release"`.

## Running the sample

### Example Output
```
Device: Intel(R) Gen9
Repeating computation 8 times to measure run time ...
Iteration: 1
Iteration: 2
Iteration: 3
...
Iteration: 8
Successfully computed all pairs shortest paths in parallel!
Time sequential: 0.583029 sec
Time parallel: 0.159223 sec

```
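
The sequential and parallel times shown above are wall-clock measurements taken on the host around the corresponding computations. A generic way to collect such numbers is sketched below; this assumes plain `std::chrono` timing and is not the sample's exact code.
```
#include <chrono>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> v(1'000'000, 1.0f);

  auto start = std::chrono::steady_clock::now();
  // ... region being measured, e.g. a kernel submission followed by wait() ...
  float sum = 0.0f;
  for (float x : v) sum += x;
  auto stop = std::chrono::steady_clock::now();

  double seconds = std::chrono::duration<double>(stop - start).count();
  std::cout << "Time: " << seconds << " sec (checksum " << sum << ")\n";
  return 0;
}
```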
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30104.148
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "apsp", "all-pairs-shortest-paths.vcxproj", "{28C45A93-5D80-4635-BAFB-D5386EEEA466}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Debug|x64.ActiveCfg = Debug|x64
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Debug|x64.Build.0 = Debug|x64
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Release|x64.ActiveCfg = Release|x64
{28C45A93-5D80-4635-BAFB-D5386EEEA466}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {2ADBAE67-858D-4FA7-B81F-3C9AF9347EFA}
EndGlobalSection
EndGlobal