diff --git a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md index 63dbe2063b..b7e29823b0 100644 --- a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md +++ b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md @@ -5,9 +5,9 @@ Mandelbrot is an infinitely complex fractal patterning that is derived from a si | Optimized for | Description |:--- |:--- -| OS | MacOS Catalina or newer; Linux* Ubuntu* 18.04 +| OS | MacOS Catalina or newer; | Hardware | Skylake with GEN9 or newer -| Software | Intel® C++ Compiler 19.1 or newer +| Software | Intel® oneAPI C++ Compiler Classic | What you will learn | How to optimize a scalar implementation using OpenMP pragmas | Time to complete | 15 minutes diff --git a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json index ece8ab4756..9bf6d60004 100644 --- a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json +++ b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json @@ -2,16 +2,12 @@ "name": "Mandelbrot OpenMP*", "description": "Calculates the mandelbrot set and outputs a bmp image representation using OpenMP*", "categories": ["Toolkit/Intel® oneAPI HPC Toolkit"], - "os": ["linux", "darwin"], - "builder": ["cmake"], + "os": ["darwin"], + "builder": ["make"], "languages": [{"cpp":{}}], "toolchain": ["icc"], "guid": "DD113F58-4D91-41BB-B46E-6CF2C0D9F6F9", "ciTests": { - "linux": [ - { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, - { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] } - ], "darwin": [ { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] } diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md 
b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md index 50e0f51b90..a99d5b006c 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md @@ -4,10 +4,10 @@ The intrinsic samples are designed to show how to utilize the intrinsics support | Optimized for | Description |:--- |:--- -| OS | Linux* Ubuntu* 18.04; MacOS* Catalina* or newer +| OS | MacOS* Catalina* or newer | Hardware | Skylake with GEN9 or newer -| Software | Intel® C++ Compiler 2021.1 or newer; -| What you will learn | How to utlize intrinsics supported by the Intel® C++ Compiler +| Software | Intel® oneAPI C++ Compiler Classic +| What you will learn | How to utlize intrinsics supported by the Intel® oneAPI C++ Compiler Classic | Time to complete | 15 minutes diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json index 8bc2fbc314..40360e7968 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json @@ -1,17 +1,13 @@ { "name": "Intrinsics C++", - "description": "Demonstrates the intrinsic functions of the Intel® C++ Compiler", + "description": "Demonstrates the intrinsic functions of the Intel® oneAPI C++ Compiler Classic", "categories": ["Toolkit/Intel® oneAPI HPC Toolkit"], - "os": ["linux", "darwin"], - "builder": ["cmake"], + "os": ["darwin"], + "builder": ["make"], "languages": [{"cpp":{}}], "toolchain": ["icc"], "guid": "ACD0E89E-67CC-4CB4-87AB-B12B84962EAF", "ciTests": { - "linux": [ - { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, - { "id": "debug", "steps": [ "make debug", "make debug_run", "make clean" ] } - ], "darwin": [ { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, { "id": "debug", "steps": [ "make debug", "make debug_run", "make clean" ] } diff --git 
a/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md b/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md index ce51161a1a..43356ac52a 100644 --- a/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md +++ b/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md @@ -6,10 +6,10 @@ For more details about merge sort algorithm and top-down implementation, please | Optimized for | Description |:--- |:--- -| OS | Linux* Ubuntu* 18.04; MacOS Catalina or newer +| OS | MacOS Catalina or newer | Hardware | Skylake with GEN9 or newer -| Software | Intel® C++ Compiler 19.1 or newer; -| What you will learn | How to accelerate a scalar program using OpenMP tasks +| Software | Intel® oneAPI C++ Compiler Classic +| What you will learn | How to accelerate a scalar program using OpenMP* tasks | Time to complete | 15 minutes Performance number tabulation diff --git a/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json b/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json index cde821978e..a58affeae8 100644 --- a/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json +++ b/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json @@ -2,16 +2,12 @@ "name": "MergeSort C++/OpenMP*", "description": "Classic sorting algorithm using OpenMP*", "categories": ["Toolkit/Intel® oneAPI HPC Toolkit"], - "os": ["linux", "darwin"], - "builder": ["cmake"], + "os": ["darwin"], + "builder": ["make"], "languages": [{"cpp":{}}], "toolchain": ["icc"], "guid": "5AFED65F-F725-411D-B21C-B59008D1166D", "ciTests": { - "linux": [ - { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, - { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] } - ], "darwin": [ { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] } diff --git 
a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt index 9cde07f558..8f608e972a 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt @@ -1,4 +1,4 @@ -Copyright Intel Corporation +Copyright 2019 Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md index 312bb4e783..53da36b8b1 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md @@ -8,8 +8,8 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa |:--- |:--- | OS | Linux* Ubuntu* 18.04; Windows 10 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler beta; -| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | How to offload the computation to GPU using the Intel® oneAPI DPC++/C++ Compiler | Time to complete | 15 minutes ## Purpose diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj index 19bac293d5..8a4eaa9d40 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj +++ 
b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj @@ -114,7 +114,6 @@ Console true - $(ONEAPI_ROOT)\compiler\latest\windows\bin\libsycl-complex.o @@ -152,10 +151,9 @@ true true true - $(ONEAPI_ROOT)\compiler\latest\windows\bin\libsycl-complex.o - \ No newline at end of file + diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt index 9cd8f8f64d..4c3d57303d 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt @@ -2,10 +2,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++17") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") add_executable(mandelbrot main.cpp) -target_link_libraries(mandelbrot OpenCL sycl $ENV{ONEAPI_ROOT}/compiler/latest/linux/lib/libsycl-complex.o) -add_custom_target(run ${CMAKE_COMMAND} -E env SYCL_BE=PI_OPENCL ./mandelbrot) +target_link_libraries(mandelbrot OpenCL sycl) +add_custom_target(run ./mandelbrot) add_executable(mandelbrot_usm main.cpp) target_compile_definitions(mandelbrot_usm PRIVATE MANDELBROT_USM) -target_link_libraries(mandelbrot_usm OpenCL sycl $ENV{ONEAPI_ROOT}/compiler/latest/linux/lib/libsycl-complex.o) -add_custom_target(run_usm ${CMAKE_COMMAND} -E env SYCL_BE=PI_OPENCL ./mandelbrot_usm) +target_link_libraries(mandelbrot_usm OpenCL sycl) +add_custom_target(run_usm ./mandelbrot_usm) diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp index 991478032c..7c261a5e56 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp @@ -33,6 +33,10 @@ struct MandelParameters { int max_iterations_; typedef std::complex ComplexF; + static std::complex complex_square( std::complex c) + { + return 
std::complex( c.real()*c.real() - c.imag()*c.imag(), c.real()*c.imag()*2 ); + } MandelParameters(int row_count, int col_count, int max_iterations) : row_count_(row_count), @@ -41,7 +45,7 @@ struct MandelParameters { int row_count() const { return row_count_; } int col_count() const { return col_count_; } - int max_iterations() const { return max_iterations_; } +int max_iterations() const { return max_iterations_; } // Scale from 0..row_count to -1.5..0.5 float ScaleRow(int i) const { return -1.5f + (i * (2.0f / row_count_)); } @@ -63,7 +67,8 @@ struct MandelParameters { break; } - z = z * z + c; + // z = z * z + c; + z = complex_square(z) + c; count++; } diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md index 759b7e1576..db05d53647 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md +++ b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md @@ -7,8 +7,8 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa |:--- |:--- | OS | Linux Ubuntu 18.04, Windows 10 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler (beta) -| What you will learn | The Sepia Filter sample demonstrates the following using the oneAPI DPC++ compiler
  • Writing a custom device selector class
  • Offloading compute intensive parts of the application using both lamba and functor kernels
  • Measuring kernel execution time by enabling profiling
+| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | The Sepia Filter sample demonstrates the following using the Intel® oneAPI DPC++/C++ Compiler
  • Writing a custom device selector class
  • Offloading compute intensive parts of the application using both lambda and functor kernels
  • Measuring kernel execution time by enabling profiling
| Time to complete | 20 minutes ## Purpose diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json index 6abd3d250f..e2ff514d31 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json +++ b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json @@ -1,7 +1,7 @@ { "guid": "B9C425DB-A3AD-4FCB-9CA0-1909E5189FB7", "name": "Sepia Filter", - "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU"], + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU"], "toolchain": ["dpcpp"], "description": "A program that converts an image to sepia tone", "languages": [{ diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md index 4ef647b606..246791a6fd 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md @@ -11,7 +11,7 @@ custom types of classes in a DPC++ program |:--- |:--- | OS | Linux Ubuntu 18.04, Windows 10 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler (beta) +| Software | Intel® oneAPI DPC++/C++ Compiler | What you will learn | Using custom type classes and offloads complex number computations to GPU using Intel DPC++ | Time to complete | 15 minutes diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json index 5b2c4309a1..2824cf8808 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json @@ -1,7 +1,7 @@ { "guid": "D725E06E-0ECE-44F8-910D-AD1A8C89ED89", "name": "Complex number Multiplication", - "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ 
Compiler/CPU and GPU" ], + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU" ], "description": "program that computes the multiplication of a Complex number", "toolchain": [ "dpcpp" ], "languages": [ { "cpp": { "properties": { "projectOptions": [ { "projectType": "makefile" } ] } } } ], diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md index c50970d237..34eebdfd9e 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md @@ -10,7 +10,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa |:--- |:--- | OS | Linux* Ubuntu* 18.04, Windows 10* | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler beta, Intel® C/C++ Compiler beta +| Software | Intel® oneAPI DPC++/C++ Compiler, Intel® oneAPI C++ Compiler Classic | What you will learn | Offloads computations on 2D arrays to GPU using Intel DPC++ and OpenMP | Time to complete | 15 minutes diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md index 662c9df298..7c156e79f7 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md @@ -8,7 +8,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa |:--- |:--- | OS | Linux* Ubuntu* 18.04, Windows 10 | Hardware | Skylake with GEN9 or newer, Intel(R) Programmable Acceleration Card with Intel(R) Arria(R) 10 GX FPGA -| Software | Intel® oneAPI DPC++ Compiler (beta) +| Software | Intel® oneAPI DPC++/C++ Compiler diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json index cb7d58bb6a..619d872475 100644 --- 
a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json @@ -1,7 +1,7 @@ { "guid" : "49C65CB6-F9FA-4E3C-B8BE-4A141E4E0F07", "name": "Simple Add", - "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU, GPU and FPGA"], + "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU, GPU and FPGA"], "description": "Simple program that adds two large vectors in parallel. Provides a ‘Hello World!’ like sample to ensure your environment is setup correctly using Data Parallel C++.", "toolchain": ["dpcpp"], "languages": [{"cpp": {"properties": {"projectOptions": [{"projectType": "makefile"}]}}}], diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md index 9f32169505..ba8a52deaa 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md @@ -8,7 +8,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa |:--- |:--- | OS | Linux* Ubuntu* 18.04, Windows 10 | Hardware | Skylake with GEN9 or newer, Intel(R) Programmable Acceleration Card with Intel(R) Arria(R) 10 GX FPGA -| Software | Intel® oneAPI DPC++ Compiler (beta) +| Software | Intel® oneAPI DPC++/C++ Compiler ## Purpose The `vector-add` is a simple program that adds two large vectors of integers and verifies the results. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. 
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json index 9737eea2fb..f86a214617 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json @@ -1,7 +1,7 @@ { "guid":"b1b58be7-e22e-4ca2-ba59-6887b2f1be6c", "name": "Vector Add", - "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU, GPU and FPGA"], + "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU, GPU and FPGA"], "description": "Simple program that adds two large vectors in parallel. Provides a ‘Hello World!’ like sample to ensure your environment is setup correctly using simple Data Parallel C++.", "toolchain": ["dpcpp"], "languages": [{"cpp": {"properties": {"projectOptions": [{"projectType": "makefile"}]}}}], diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md index 061f753ed0..3e28e7c495 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -13,8 +13,8 @@ and search based on relevant terms noted in the comments. |:--- |:--- | OS | Linux Ubuntu 18.04 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) -| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | Implement bitonic sort using Intel® oneAPI DPC++/C++ Compiler | Time to complete | 15 minutes @@ -51,7 +51,7 @@ if a compatible GPU is not detected. ## Key Implementation Details The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command g -roups. 
Unified Shared Memory (USM) is used for data management. +roups. Unified Shared Memory (USM) and Buffer Object are used for data management. ## License This code sample is licensed under MIT license @@ -117,7 +117,10 @@ the ascending order is verified, the application will display a “Success!” m $ ./bitonic-sort 21 47 Array size: 2097152, seed: 47 Device: Intel(R) Gen9 HD Graphics NEO -Kernel time: 0.416827 sec -CPU serial time: 0.60523 sec +Warm up ... +Kernel time using USM: 0.248422 sec +Kernel time using buffer allocation: 0.253364 sec +CPU serial time: 0.628803 sec + Success! ``` diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json index c382d764e1..75efdfa0f0 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json @@ -1,7 +1,7 @@ { "guid": "4D5B57B8-6F34-4A11-89F5-3F07E766DB39", "name": "bitonic-sort", - "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU" ], "description": "Bitonic Sort using Intel® oneAPI DPC++ Language", "toolchain": [ "dpcpp" ], "targetDevice": [ "CPU", "GPU" ], diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp index e0e4312520..0153bf4cd1 100644 --- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -35,38 +35,93 @@ // data to the kernel. The kernel swaps the elements accordingly in parallel. // #include -#include #include +// dpc_common.hpp can be found in the dev-utilities include folder. 
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + using namespace sycl; using namespace std; -void ParallelBitonicSort(int a[], int n, queue &q) { +#define DEBUG 0 + +void ParallelBitonicSort(int data_gpu[], int n, queue &q) { // n: the exponent used to set the array size. Array size = power(2, n) int size = pow(2, n); + int* a = data_gpu; + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1, ..., 0 + for (int stage = step; stage >= 0; stage--) { + int seq_len = pow(2, stage + 1); + + // Constant used in the kernel: 2**(step-stage). + int two_power = 1 << (step - stage); + // Offload the work to kernel. + q.submit([&](handler &h) { + h.parallel_for(range<1>(size), [=](id<1> i) { + // Assign the bitonic sequence number. + int seq_num = i / seq_len; + + // Variable used to identified the swapped element. + int swapped_ele = -1; + + // Because the elements in the first half in the bitonic + // sequence may swap with elements in the second half, + // only the first half of elements in each sequence is + // required (seq_len/2). + int h_len = seq_len / 2; + + if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len; + + // Check whether increasing or decreasing order. + int odd = seq_num / two_power; + + // Boolean variable used to determine "increasing" or + // "decreasing" order. + bool increasing = ((odd % 2) == 0); + + // Swap the elements in the bitonic sequence if needed + if (swapped_ele != -1) { + if (((a[i] > a[swapped_ele]) && increasing) || + ((a[i] < a[swapped_ele]) && !increasing)) { + int temp = a[i]; + a[i] = a[swapped_ele]; + a[swapped_ele] = temp; + } + } + }); + }); + q.wait(); + } // end stage + } // end step +} + +void ParallelBitonicSortBuffer(int data_gpu[], int n, queue &q) { + // n: the exponent used to set the array size. 
Array size = power(2, n) + int size = pow(2, n); + + buffer input (data_gpu, size); + // step from 0, 1, 2, ...., n-1 for (int step = 0; step < n; step++) { // for each step s, stage goes s, s-1, ..., 0 for (int stage = step; stage >= 0; stage--) { - // In each state, construct a number (num_seq) of bitonic sequences of - // size seq_len (2, 4, ...) num_seq stores the number of bitonic sequences - // at each stage. seq_len stores the length of the bitonic sequence at - // each stage. int seq_len = pow(2, stage + 1); -#if DEBUG - int num_seq = pow(2, (n - stage - 1)); // Used for debug purpose. - std::cout << "step num:" << step << " stage num:" << stage - << " num_seq:" << num_seq << "(" << seq_len << ") => "; -#endif + // Constant used in the kernel: 2**(step-stage). int two_power = 1 << (step - stage); // Offload the work to kernel. q.submit([&](handler &h) { - h.parallel_for(range<1>(size), [=](id<1> i) { + auto a = input.get_access(h); + + h.parallel_for(range<1>(size), [=](id<1> i) { // Assign the bitonic sequence number. - int seq_num = i / seq_len; + int seq_num = i / seq_len; // Variable used to identified the swapped element. int swapped_ele = -1; @@ -190,40 +245,62 @@ int main(int argc, char *argv[]) { std::cout << "Device: " << q.get_device().get_info() << "\n"; + // Memory allocated for host access only. + int *data_cpu = (int *)malloc(size * sizeof(int)); + // USM allocation using malloc_shared: data stores a sequence of random // numbers. - int *data = malloc_shared(size, q); + int *data_usm = malloc_shared(size, q); - // Memory allocated for host access only. - int *data2 = (int *)malloc(size * sizeof(int)); + // Memory allocated to store gpu results using buffer allocation + int *data_gpu = (int *)malloc(size * sizeof(int)); // Initialize the array randomly using a seed. 
srand(seed); - for (int i = 0; i < size; i++) data[i] = data2[i] = rand() % 1000; + for (int i = 0; i < size; i++) + data_usm[i] = data_gpu[i] = data_cpu[i] = rand() % 1000; #if DEBUG std::cout << "\ndata before:\n"; - DisplayArray(data, size); + DisplayArray(data_usm, size); #endif + // Warm up + std::cout << "Warm up ...\n"; + ParallelBitonicSort(data_usm, n, q); + // Start timer dpc_common::TimeInterval t_par; - ParallelBitonicSort(data, n, q); + // Parallel sort using USM + ParallelBitonicSort(data_usm, n, q); - std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + std::cout << "Kernel time using USM: " << t_par.Elapsed() << " sec\n"; #if DEBUG - std::cout << "\ndata after sorting using parallel bitonic sort:\n"; - DisplayArray(data, size); + std::cout << "\ndata_usm after sorting using parallel bitonic sort:\n"; + DisplayArray(data_usm, size); #endif + // Start timer + dpc_common::TimeInterval t_par2; + + // Parallel sort using buffer allocation + ParallelBitonicSortBuffer(data_gpu, n, q); + + std::cout << "Kernel time using buffer allocation: " << t_par2.Elapsed() << " sec\n"; + +#if DEBUG + std::cout << "\ndata_gpu after sorting using parallel bitonic sort:\n"; + DisplayArray(data_gpu, size); +#endif + // Start timer dpc_common::TimeInterval t_ser; // Bitonic sort in CPU (serial) - BitonicSort(data2, n); + BitonicSort(data_cpu, n); std::cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n"; @@ -231,18 +308,22 @@ int main(int argc, char *argv[]) { bool pass = true; for (int i = 0; i < size - 1; i++) { // Validate the sequence order is increasing in both kernel and CPU. - if ((data[i] > data[i + 1]) || (data[i] != data2[i])) { + if ((data_usm[i] > data_usm[i + 1]) || (data_usm[i] != data_cpu[i])) { pass = false; break; } + + if ((data_gpu[i] > data_gpu[i + 1]) || (data_gpu[i] != data_cpu[i])) { + pass = false; + break; + } } - // Clean USM resources. - free(data, q); - - // Clean CPU memory. - free(data2); - + // Clean resources. 
+ free(data_cpu); + free(data_usm, q); + free(data_gpu); + if (!pass) { std::cout << "\nFailed!\n"; return -2; diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/CMakeLists.txt b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/CMakeLists.txt new file mode 100644 index 0000000000..07ec9bb778 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (hidden-markov-models) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (hidden-markov-models src/hidden-markov-models.cpp) + +add_custom_target (run + COMMAND hidden-markov-models + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/License.txt b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/License.txt new file mode 100644 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the 
Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/README.md b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/README.md new file mode 100644 index 0000000000..8a880848c6 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/README.md @@ -0,0 +1,89 @@ +#`DPC++ Hidden Markov Model` Sample +The HMM (Hidden Markov Model) sample presents a statistical model using a Markov process to present graphable nodes that are otherwise in an unobservable state or “hidden”. This technique is helpful in pattern recognition such as speech, handwriting, gesture recognition, part-of-speech tagging, partial discharges and bioinformatics. The sample offloads the complexity of the Markov process to the GPU. + +The directed edges of this graph are possible transitions beetween nodes or states defined with the following parameters: the number of states is N, the transition matrix A is a square matrix of size N. Each element with indexes (i,j) of this matrix determines the probability to move from the state i to the state j on any step of the Markov process (i and j can be the same if the state does not change on the taken step). + +The main assumption of the HMM is that there are visible observations that depend on the current Markov process. 
That dependency can be described as a conditional probability distribution (represented by emission matrix). The problem is to find out the most likely chain of the hidden Markov states using the given observations set. + +##Requirements and sample info + +| Optimized for | Description +|:--- |:--- +| OS | Linux* Ubuntu* 18.04, Windows 10 +| Hardware | Skylake with GEN9 or newer, +| Software | Intel® oneAPI DPC++ Compiler (beta) +| What you will learn | Implement Viterbi algorithm to get the most likely path that consists of the hidden states +| Time to complete | 1 minute + +##Purpose + +The sample can use GPU offload to compute sequential steps of multiple graph traversals simultaneously. + +This code sample implements the Viterbi algorithm which is a dynamic programming algorithm for finding the most likely sequence of hidden states—called the Viterbi path—that results in a sequence of observed events, especially in the context of Markov information sources and HMM. + +- Initially, the dataset for algorithm processing is generated: initial states probability distribution Pi, transition matrix A, emission matrix B and the sequence or the observations produced by hidden Markov process. +- First, the matrix of Viterbi values on the first states are initialized using distribution Pi and emission matrix B. The matrix of back pointers is initialized with default values -1. +- Then, for each time step the Viterbi matrix is set to the maximal possible value using A, B and Pi. +- Finally, the state with maximum Viterbi value on the last step is set as a final state of the Viterbi path and the previous nodes of this path are detemined using the correspondent rows of back pointers matrix for each of the steps except the last one. + +Note: The implementation uses logarithms of the probabilities to process small numbers correctly and to replace multiplication operations with addition operations. 
## Key Implementation Details
\ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.filters b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.filters new file mode 100644 index 0000000000..5f08be7fdb --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.sln b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.sln new file mode 100644 index 0000000000..10106f9039 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.30320.27 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hidden-markov-models", "hidden-markov-models.vcxproj", "{46454D0B-76F3-45EB-A186-F315A2E22DEA}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.ActiveCfg = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.Build.0 = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.ActiveCfg = Release|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + 
GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {B1D84B81-F5D5-4459-AA6E-38B695FB908B} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.user b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.user new file mode 100644 index 0000000000..fa6ed154c1 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.user @@ -0,0 +1,9 @@ + + + + WindowsLocalDebugger + + + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj new file mode 100644 index 0000000000..e894a8cca6 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj @@ -0,0 +1,144 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {46454d0b-76f3-45eb-a186-f315a2e22dea} + Win32Proj + hidden-markov-models + $(WindowsSDKVersion.Replace("\","")) + hidden-markov-models + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + Disabled + Level3 + + + Console + true + /Od;%(SpecifyDevCmplAdditionalOptions) + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + Disabled + Level3 + + 
+ Console + true + true + true + /Od;%(SpecifyDevCmplAdditionalOptions) + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj.user b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj.user new file mode 100644 index 0000000000..e631a72cce --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj.user @@ -0,0 +1,17 @@ + + + + cpu + WindowsLocalDebugger + CL_CONFIG_USE_NATIVE_DEBUGGER=1 +SYCL_DEVICE_TYPE=CPU +$(LocalDebuggerEnvironment) + + + cpu + WindowsLocalDebugger + CL_CONFIG_USE_NATIVE_DEBUGGER=1 +SYCL_DEVICE_TYPE=CPU +$(LocalDebuggerEnvironment) + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/sample.json b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/sample.json new file mode 100644 index 0000000000..6dadf9de3f --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "A63E408B-75ED-4379-A6B5-AF013C0EBA58", + "name": "hidden-markov-models", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Bitonic Sort using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild hidden-markov-models.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "hidden-markov-models.exe" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/src/hidden-markov-models.cpp b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/src/hidden-markov-models.cpp new file mode 100644 
index 0000000000..6b2e91a8c6 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/src/hidden-markov-models.cpp @@ -0,0 +1,189 @@ +//============================================================== +// Copyright © Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Hidden Markov Models: this code sample implements the Viterbi algorithm which is a dynamic +// programming algorithm for findingthe most likely sequence of hidden states— +// called the Viterbi path—that results in a sequence of observed events, +// especially in the context of Markov information sources and HMM. +// +// The sample can use GPU offload to compute sequential steps of multiple graph traversals simultaneously. +// +// - Initially, the dataset for algorithm processing is generated : initial states probability +// distribution Pi, transition matrix A, emission matrix Band the sequence or the observations +// produced by hidden Markov process. +// - First, the matrix of Viterbi values on the first states are initialized using distribution Pi +// and emission matrix B.The matrix of back pointers is initialized with default values - 1. +// - Then, for each time step the Viterbi matrix is set to the maximal possible value using A, B and Pi. +// - Finally, the state with maximum Viterbi value on the last step is set as a final state of +// the Viterbi pathand the previous nodes of this path are detemined using the correspondent rows +// of back pointers matrix for each of the steps except the last one. +// +// Note: The implementation uses logarithms of the probabilities to process small numbers correctly +// and to replace multiplication operations with addition operations. + +#include +#include +#include +#include +#include +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. 
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +// Matrix size constants. +// The number of hidden states N. +constexpr int N = 20; +// The number of possible observations M. +constexpr int M = 20; +// The lenght of the hidden states sequence T. +constexpr int T = 20; +// The parameter for generating the sequence. +constexpr int seed = 0; +// Minimal double to initialize logarithms for Viterbi values equal to 0. +constexpr double MIN_DOUBLE = -1.0 * std::numeric_limits::max(); + +bool ViterbiCondition(double x, double y, double z, double compare); + +int main() { + try { + // Initializing and generating initial probabilities for the hidden states. + double(*pi) = new double[N]; + for (int i = 0; i < N; ++i) { + pi[i] = sycl::log10(1.0f / N); + } + buffer pi_buf(pi, N); + + //Device initialization. + queue q(default_selector{}, dpc_common::exception_handler); + cout << "Device: " << q.get_device().get_info() << " " + << q.get_device().get_platform().get_info() << "\n"; + + //Buffers initialization. + buffer viterbi(range<2>(N, T)); + buffer back_pointer(range<2>(N, T)); + buffer a(range<2>(N, N)); + buffer b(range<2>(N, M)); + + // Generating transition matrix A for the Markov process. + q.submit([&](handler& h) { + auto a_acc = a.get_access(h); + h.parallel_for(range<2>(N, N), [=](id<2> index) { + // The sum of the probabilities in each row of the matrix A has to be equal to 1. + double prob = 1.0f / N; + // The algorithm computes logarithms of the probability values to improve small numbers processing. + a_acc[index] = sycl::log10(prob); + }); + }); + + // Generating emission matrix B for the Markov process. + q.submit([&](handler& h) { + auto b_acc = b.get_access(h); + h.parallel_for(range<2>(N, M), [=](id<2> index) { + // The sum of the probabilities in each row of the matrix B has to be equal to 1. 
+ double prob = ((index[0] + index[1]) % M) * 2.0f / M / (M - 1); + // The algorithm computes logarithms of the probability values to improve small numbers processing. + b_acc[index] = (prob == 0.0f) ? MIN_DOUBLE : sycl::log10(prob); + }); + }); + + // Generating the sequence of the observations produced by the hidden Markov chain. + int(*seq) = new int[T]; + for (int i = 0; i < T; ++i) { + seq[i] = (i * i + seed) % M; + } + buffer seq_buf(seq, T); + + // Initialization of the Viterbi matrix and the matrix of back pointers. + q.submit([&](handler& h) { + auto v_acc = viterbi.get_access(h); + auto b_ptr_acc = back_pointer.get_access(h); + auto b_acc = b.get_access(h); + auto pi_acc = pi_buf.get_access(h); + auto seq_acc = seq_buf.get_access(h); + h.parallel_for(range<2>(N, T), [=](id<2> index) { + int i = index[0]; + int j = index[1]; + // At starting point only the first Viterbi values are defined and these Values are substituted + // with logarithms due to the following equation: log(x*y) = log(x) + log(y). + v_acc[index] = (j != 0) ? MIN_DOUBLE : pi_acc[i] + b_acc[i][seq_acc[0]]; + // Default values of all the back pointers are (-1) to show that they are not determined yet. + b_ptr_acc[index] = -1; + }); + }); + delete[] pi; + + // The sequential steps of the Viterbi algorithm that define the Viterbi matrix and the matrix + // of back pointers. The product of the Viterbi values and the probabilities is substituted with the sum of + // the logarithms due to the following equation: log (x*y*z) = log(x) + log(y) + log(z). 
+ for (int j = 0; j < T - 1; ++j) { + q.submit([&](handler& h) { + auto v_acc = viterbi.get_access(h); + auto b_ptr_acc = back_pointer.get_access(h); + auto a_acc = a.get_access (h); + auto b_acc = b.get_access (h); + auto seq_acc = seq_buf.get_access (h); + + h.parallel_for(range<2>(N, N), [=](id<2> index) { + int i = index[0], k = index[1]; + // This conditional block finds the maximum possible Viterbi value on + // the current step j for the state i. + if (ViterbiCondition(v_acc[k][j], b_acc[i][seq_acc[j + 1]], a_acc[k][i], v_acc[i][j + 1])) { + v_acc[i][j + 1] = v_acc[k][j] + a_acc[k][i] + b_acc[i][seq_acc[j + 1]]; + b_ptr_acc[i][j + 1] = k; + } + }); + }); + } + delete[] seq; + + // Getting the Viterbi path based on the matrix of back pointers + buffer vit_path(range<1> {T}); + auto v_acc = viterbi.get_access(); + auto b_ptr_acc = back_pointer.get_access(); + auto vit_path_acc = vit_path.get_access(); + double v_max = MIN_DOUBLE; + // Constructing the Viterbi path. The last state of this path is the one with + // the biggest Viterbi value (the most likely state). + for (int i = 0; i < N; ++i) { + if (v_acc[i][T - 1] > v_max) { + v_max = v_acc[i][T - 1]; + vit_path_acc[T - 1] = i; + } + } + + for (int i = T - 2; i >= 0; --i) { + // Every back pointer starting from the last one contains the index of the previous + // point in Viterbi path. + vit_path_acc[i] = b_ptr_acc[vit_path_acc[i + 1]][i + 1]; + } + + cout << "The Viterbi path is: "<< std::endl; + for (int k = 0; k < T; ++k) { + cout << vit_path_acc[k] << " "; + } + cout << std::endl; + + } catch (sycl::exception const& e) { + // Exception processing + cout << "An exception is caught!\n"; + cout << "Error message:" << e.what(); + terminate(); + } + cout << "The sample completed successfully!" 
<< std::endl; + return 0; +} + +// The method checks if all three components of the sum are not equivalent to logarithm of zero +// (that is incorrect value and is substituted with minimal possible value of double) and that +// the Viterbi value on the new step exceeds the current one. +bool ViterbiCondition(double x, double y, double z, double compare) { + return (x > MIN_DOUBLE) && (y > MIN_DOUBLE) && (z > MIN_DOUBLE) && (x + y + z > compare); +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt new file mode 100644 index 0000000000..85fcec4963 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (PrefixSum) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (PrefixSum src/PrefixSum.cpp) + +add_custom_target (run + COMMAND PrefixSum 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt new file mode 100644 index 0000000000..415025cf03 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 
documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln new file mode 100644 index 0000000000..3587a92e74 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.29926.136 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64 + {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64 + 
{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj new file mode 100644 index 0000000000..6a6309b96b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {bc12abe6-7951-47d6-93dc-126f8a5fcfd2} + Win32Proj + PrefixSum + 10.0.17763.0 + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters new file mode 100644 index 0000000000..2003dce0f2 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + 
{67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user new file mode 100644 index 0000000000..7288fa06dd --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md new file mode 100644 index 0000000000..6bbc2cfdfb --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md @@ -0,0 +1,124 @@ +# `Prefix Sum` sample + +This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm compute the prefix sum in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns +a new sequence y0, y1, y2, ..., yn so that + +y0 = x0 +y1 = x0 + x1 +y2 = x0 + x1 + x2 +..... +yn = x0 + x1 + x2 + ... 
+ xn + +Below is the pseudo code for computing prefix sum in parallel: + +n is power of 2 (1, 2, 4 , 8, 16, ...): + +for i from 0 to [log2 n] - 1 do + for j from 0 to (n-1) do in parallel + if j<2^i then + x_{j}^{i+1} <- x_{j}^{i}} + else + x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} + +In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +Given n processors to perform each iteration of the inner loop in constant time, the algorithm +as a whole runs in O(log n) time, the number of iterations of the outer loop. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a +compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command +groups. + +## License +This code sample is licensed under MIT license + +## Building the `PrefixSum` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd PrefixSum + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. 
+ + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: PrefixSum + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the verification the results in the CPU. +The results are verified if yk = yk-1 + xk the original compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./PrefixSum 21 47 + +Sequence size: 2097152, seed: 47 +Num iteration: 21 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 170 ms + +Success! +``` diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json new file mode 100644 index 0000000000..def268a2f8 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0", + "name": "PrefixSum", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "PrefixSum.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp 
b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp new file mode 100644 index 0000000000..b2af8367a7 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp @@ -0,0 +1,239 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That +// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and +// returns a new sequence y0, y1, y2, ..., yn so that +// +// y0 = x0 +// y1 = x0 + x1 +// y2 = x0 + x1 + x2 +// ..... +// yn = x0 + x1 + x2 + ... + xn +// +// Below is the pseudo code for computing prefix sum in parallel: +// +// n is power of 2 (1, 2, 4 , 8, 16, ...): +// +// for i from 0 to [log2 n] - 1 do +// for j from 0 to (n-1) do in parallel +// if j<2^i then +// x_{j}^{i+1} <- x_{j}^{i}} +// else +// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}} +// +// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i. +// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as +// a whole runs in O(log n) time, the number of iterations of the outer loop. +// + +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. 
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +void Show(int a[], int arraysize) +{ + for (int i = 0; i < arraysize; ++i) + { + std::cout << a[i] << " "; + if ((i % 16) == 15) std::cout << "\n"; + } + + std::cout << "\n"; + return; +} + +int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + //unsigned int uintmax = UINT_MAX; + int* result = NULL; + + // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n"; + // Buffer scope + { + buffer prefix1_buf(prefix1, range<1>{nb}); + buffer prefix2_buf(prefix2, range<1>{nb}); + + // Iterate over the necessary iterations. + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + + // Submit command group for execution + q.submit([&](handler& h) { + // Create accessors + auto prefix1_acc = prefix1_buf.get_access(h); + auto prefix2_acc = prefix2_buf.get_access(h); + + if (iter % 2 == 0) { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix2_acc[j] = prefix1_acc[j]; + } + else { + prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix2; + //std::cout << "return prefix2\n"; + } + else { + h.parallel_for(range<1>(nb), [=](id<1> j) { + if (j < two_power) { + prefix1_acc[j] = prefix2_acc[j]; + } + else { + prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power]; + } + }); // end parallel for loop in kernel + result = prefix1; + //std::cout << "return prefix1\n"; + } + }); // end device queue + } // end iteration + } // Buffer scope + + // Wait for commands to complete. 
Enforce synchronization on the command queue + q.wait_and_throw(); + + return result; +} +/* +void PrefixSum(int* x, unsigned int nb) +{ + unsigned int two_power = 1; + unsigned int num_iter = log2(nb); + int temp = 0; + + // Iterate over the necessary iterations + for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) { + //Show(x, nb); + // std::cout << "two_power: " << two_power << "\n"; + for (unsigned int j = nb; j > 0; j--) { + if (j < two_power) { + x[j] = x[j]; + } + else { + x[j] = x[j] + x[j - two_power]; + } + } + } +} +*/ +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. The number of el\ +ement in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\ +sponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char* argv[]) { + unsigned int nb, seed; + int n, exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + nb = pow(2, n); + } catch (...) 
{ + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nSequence size: " << nb << ", seed: " << seed; + + int num_iter = log2(nb); + std::cout << "\nNum iteration: " << num_iter << "\n"; + + // Define device selector as 'default' + default_selector device_selector; + + // exception handler + auto exception_handler = [](exception_list exceptionList) { + for (std::exception_ptr const& e : exceptionList) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception const& e) { + std::terminate(); + } + } + }; + + // Create a device queue using DPC++ class queue + queue q(device_selector, exception_handler); + + std::cout << "Device: " << q.get_device().get_info() << "\n"; + + int *data = new int[nb]; + int *prefix_sum1 = new int[nb]; + int *prefix_sum2 = new int[nb]; + int *result = NULL; + + srand(seed); + + // Initialize data arrays + for (int i = 0; i < nb; i++) { + data[i] = prefix_sum1[i] = rand() % 10; + prefix_sum2[i] = 0; + } + + // Start timer + auto start = std::chrono::steady_clock::now(); + + result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q); + + auto end = std::chrono::steady_clock::now(); + auto timeKern = std::chrono::duration_cast(end - start).count(); + std::cout << "Kernel time: " << timeKern << " ms" << "\n"; + + //std::cout << "\ndata after transforming using parallel prefix sum result:"; + //Show(result, nb); + + bool equal = true; + + if (result[0] != data[0]) + equal = false; + else + { + for (int i = 1; i < nb; i++) { + if (result[i] != result[i - 1] + data[i]) + { + equal = false; + break; + } + } + } + + delete[] data; + delete[] prefix_sum1; + delete[] prefix_sum2; + + if (!equal) { + std::cout << "\nFailed: " << std::endl; + return -2; + } + else { + std::cout << "\nSuccess!" 
<< std::endl; + return 0; + } +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt new file mode 100644 index 0000000000..f472928505 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt @@ -0,0 +1,12 @@ +set(CMAKE_CXX_COMPILER "dpcpp") +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release") + set (CMAKE_BUILD_TYPE "Release" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +cmake_minimum_required (VERSION 3.0) +project(dpc_reduce LANGUAGES CXX) +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/License.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/License.txt new file mode 100644 index 0000000000..9cde07f558 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/License.txt @@ -0,0 +1,8 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md new file mode 100644 index 0000000000..7a08d01177 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md @@ -0,0 +1,76 @@ +# dpc_reduce Sample + +The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators. + + +For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + +| Optimized for | Description +| OS | Linux* Ubuntu* 18.04, +| Hardware | Skylake with GEN9 or newer, +| Software | Intel® oneAPI DPC++ Compiler (beta) +| What you will learn | how to perform reduction with oneAPI on cpu and gpu +| Time to complete | 30 min + +## Purpose +This example demonstrates how to do reduction by using the CPU in serial mode, +the CPU in parallel mode (using TBB), the GPU using direct DPC++ coding, the +GPU using multiple steps with DPC++ Library algorithms transform and reduce, +and then finally using the DPC++ Library transform_reduce algorithm. + +All the different modes use a simple calculation for Pi. It is a well known +mathematical formula that if you integrate from 0 to 1 over the function, +(4.0 / (1+x*x) )dx the answer is pi. One can approximate this integral +by summing up the area of a large number of rectangles over this same range. + +Each of the different functions calculates pi by breaking the range into many +tiny rectangles and then summing up the results.
+ +The parallel computations are performed using oneTBB and oneAPI DPC++ library +(oneDPL). + +## Key Implementation Details +The basic DPC++ implementation explained in the code includes accessor, +kernels, queues, buffers as well as some oneDPL library calls. + +## License +This code sample is licensed under MIT license. + +## Building the dpc_reduce program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system". + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +Perform the following steps: +1. Build the program using the following 'cmake' commands +mkdir build +cd build +cmake .. +make + +2. Run the program using: +make run or src/dpc_reduce + +3. Clean the program using: +make clean + + +## Running the Sample +### Application Parameters +There are no editable parameters for this sample. 
+ +### Example of Output +Number of steps is 1000000 +Cpu Seq calc: PI =3.14 in 0.00348 seconds +Cpu TBB calc: PI =3.14 in 0.00178 seconds +dpstd native: PI =3.14 in 0.191 seconds +dpstd native2: PI =3.14 in 0.142 seconds +dpstd native3: PI =3.14 in 0.002 seconds +dpstd native4: PI =3.14 in 0.00234 seconds +dpstd two steps: PI =3.14 in 0.00138 seconds +dpstd transform_reduce: PI =3.14 in 0.000442 seconds +success diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json new file mode 100644 index 0000000000..b8c2f8cb72 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json @@ -0,0 +1,29 @@ + { + "guid": "ECF6C8EB-753B-4107-AF64-60662CE41726", + "name": "DPC Reduce", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/oneAPI DPC++ Library/CPU and GPU"], + "description": "It models transform reduce in different ways showing capability of oneAPI.", + "toolchain": ["dpcpp"], + "languages": [{ + "cpp": {} + }], + "targetDevice": ["CPU", "GPU"], + "os": ["linux"], + "builder": ["cmake"], + "ciTests": { + "linux": [ + { + "id": "dpc_reduce", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "./src/dpc_reduce" + ] + } + ] + } +} + + diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt new file mode 100644 index 0000000000..cc3703162b --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt @@ -0,0 +1,24 @@ +if (NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 14) +endif() + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ltbb") + +# Add an executable target from source files +add_executable(${PROJECT_NAME} main.cpp) + +if(WIN32) + # Specify libraries to link 
with + target_link_libraries(${PROJECT_NAME} sycl ) + + # Add custom target for running + add_custom_target(run ${PROJECT_NAME}.exe) +else() + # Add custom target for running + add_custom_target(run ./${PROJECT_NAME}) +endif() diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp new file mode 100644 index 0000000000..25cf767a49 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp @@ -0,0 +1,519 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include // setprecision library +#include +#include +#include +#include +#include "dpc_common.hpp" +// Many oneAPI code samples share common include files. These +// include files are installed locally with the product installation +// and can be located at %ONEAPI_ROOT%\dev-utilities\latest\include +// on your development system. + +using namespace sycl; + +// cpu_seq is a simple sequential CPU routine +// that calculates all the slices and then +// does a reduction. +float calc_pi_cpu_seq(int num_steps) { + float step = 1.0 / (float)num_steps; + float x; + float sum = 0.0; + for (int i = 1; i < num_steps; i++) { + x = (i - 0.5) * step; + sum = sum + 4.0 / (1.0 + x * x); + } + return sum / (float)num_steps; +} + +// cpu_tbb is a simple parallel_reduce tbb routine +// that calculates all the slices and then +// uses tbb reduce to combine results. 
+float calc_pi_cpu_tbb(int num_steps) { + float step = 1.0 / (float)num_steps; + + auto tbbtotal = + tbb::parallel_reduce(tbb::blocked_range(1, num_steps), 0.0, + [=](tbb::blocked_range r, float running_total) { + float y; + for (int i = r.begin(); i != r.end(); i++) { + y = (i - 0.5) * step; + running_total += 4.0 / (1.0 + y * y); + } + return running_total; + }, + std::plus()); + return tbbtotal / (float)num_steps; +} + +// dpstd_native uses a parallel_for to fill +// a buffer with all the slice calculations and +// then uses a single_task to combine all the results +// This is not the highest performing example but shows +// how to do calculations directly in dpc++ with +// mininmal complexity. +template +float calc_pi_dpstd_native(size_t num_steps, Policy&& policy) { + float step = 1.0 / (float)num_steps; + + float data[num_steps]; + + // Create buffer using host allocated "data" array + buffer buf{data, range<1>{num_steps}}; + + policy.queue().submit([&](handler& h) { + auto writeresult = buf.get_access(h); + h.parallel_for(range<1>{num_steps}, [=](id<1> idx) { + float x = ((float)idx[0] - 0.5) / (float)num_steps; + writeresult[idx[0]] = 4.0f / (1.0 + x * x); + }); + }); + policy.queue().wait(); + + // Single task is needed here to make sure + // data is not written over. + policy.queue().submit([&](handler& h) { + auto a = buf.get_access(h); + h.single_task([=]() { + for (int i = 1; i < num_steps; i++) a[0] += a[i]; + }); + }); + policy.queue().wait(); + + float mynewresult = + buf.get_access()[0] / (float)num_steps; + return mynewresult; +} + +// This option uses a parallel for to fill the array, and then use a single +// task to reduce into groups and then use cpu for final reduction. 
+template +float calc_pi_dpstd_native2(size_t num_steps, Policy&& policy, int group_size) { + float step = 1.0 / (float)num_steps; + + float data[num_steps]; + float myresult = 0.0; + + // Create buffer using host allocated "data" array + buffer buf{data, range<1>{num_steps}}; + + // fill buffer with calculations + policy.queue().submit([&](handler& h) { + auto writeresult = buf.get_access(h); + h.parallel_for(range<1>{num_steps}, [=](id<1> idx) { + float x = ((float)idx[0] - 0.5) / (float)num_steps; + writeresult[idx[0]] = 4.0f / (1.0 + x * x); + }); + }); + policy.queue().wait(); + + size_t num_groups = num_steps / group_size; + float c[num_groups]; + // create a number of groups and do a local reduction + // within these groups using single_task. Store each + // result within the output of bufc + for (int i = 0; i < num_groups; i++) c[i] = 0; + buffer bufc{c, range<1>{num_groups}}; + for (int j = 0; j < num_groups; j++) { + policy.queue().submit([&](handler& h) { + auto my_a = buf.get_access(h); + auto my_c = bufc.get_access(h); + h.single_task([=]() { + for (int i = 0 + group_size * j; i < group_size + group_size * j; i++) + my_c[j] += my_a[i]; + }); + }); + } + policy.queue().wait(); + + auto src = bufc.get_access(); + + // Sum up results on CPU + float mynewresult = 0.0; + for (int i = 0; i < num_groups; i++) mynewresult += src[i]; + + return mynewresult / (float)num_steps; +} + +// Function operator used as transform operation in transform-reduce operations +// implemented below. +struct my_no_op { + template + Tp&& operator()(Tp&& a) const { + return std::forward(a); + } +}; + +// Structure slice area performs the calculations for +// each rectangle that will be summed up. 
+struct slice_area { + int num; + slice_area(int num_steps) { num = num_steps; } + + template + float operator()(T&& i) { + float x = ((float)i - 0.5) / (float)num; + return 4.0f / (1.0f + (x * x)); + }; +}; + +// This option uses a parallel for to fill the buffer and then +// uses a tranform_init with plus/no_op and then +// a local reduction then global reduction. +template +float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) { + float data[num_steps]; + + // Create buffer using host allocated "data" array + buffer buf{data, range<1>{num_steps}}; + + // fill the buffer with the calculation using parallel for + policy.queue().submit([&](handler& h) { + auto writeresult = buf.get_access(h); + h.parallel_for(range<1>{num_steps}, [=](id<1> idx) { + float x = (float)idx[0] / (float)num_steps; + writeresult[idx[0]] = 4.0f / (1.0f + x * x); + }); + }); + policy.queue().wait(); + + // Calc_begin and calc_end are iterators pointing to + // beginning and end of the buffer + auto calc_begin = oneapi::dpl::begin(buf); + auto calc_end = oneapi::dpl::end(buf); + + using Functor = oneapi::dpl::unseq_backend::walk_n; + float result; + + // Functor will do nothing for tranform_init and will use plus for reduce. + // In this example we have done the calculation and filled the buffer above + // The way transform_init works is that you need to have the value already + // populated in the buffer. + auto tf_init = + oneapi::dpl::unseq_backend::transform_init, + Functor>{std::plus(), + Functor{my_no_op()}}; + + auto combine = std::plus(); + auto brick_reduce = + oneapi::dpl::unseq_backend::reduce, float>{ + std::plus()}; + auto workgroup_size = + policy.queue() + .get_device() + .template get_info(); + auto max_comp_u = policy.queue() + .get_device() + .template get_info(); + auto n_groups = (num_steps - 1) / workgroup_size + 1; + n_groups = + std::min(decltype(n_groups)(max_comp_u), + n_groups); // make groups max number of compute units or less + + // 0. 
Create temporary global buffer to store temporary value + auto temp_buf = buffer(range<1>(n_groups)); + // 1. Reduce over each work_group + auto local_reduce_event = + policy.queue().submit([&buf, &temp_buf, &brick_reduce, &tf_init, + num_steps, n_groups, workgroup_size](handler& h) { + auto access_buf = buf.template get_access(h); + auto temp_acc = + temp_buf.template get_access(h); + // Create temporary local buffer + accessor + temp_buf_local(range<1>(workgroup_size), h); + h.parallel_for(nd_range<1>(range<1>(n_groups * workgroup_size), + range<1>(workgroup_size)), + [=](nd_item<1> item_id) mutable { + auto global_idx = item_id.get_global_id(0); + // 1. Initialization (transform part). + tf_init(item_id, global_idx, access_buf, num_steps, + temp_buf_local); + // 2. Reduce within work group + float local_result = brick_reduce( + item_id, global_idx, num_steps, temp_buf_local); + if (item_id.get_local_id(0) == 0) { + temp_acc[item_id.get_group(0)] = local_result; + } + }); + }); + + // 2. global reduction + auto reduce_event = local_reduce_event; + if (n_groups > 1) { + auto countby2 = decltype(n_groups)(1); + do { + reduce_event = policy.queue().submit([&reduce_event, &temp_buf, &combine, + countby2, n_groups](handler& h) { + h.depends_on(reduce_event); + auto temp_acc = + temp_buf.template get_access(h); + h.parallel_for(range<1>(n_groups), [=](item<1> item_id) mutable { + auto global_idx = item_id.get_linear_id(); + + if (global_idx % (2 * countby2) == 0 && + global_idx + countby2 < n_groups) { + temp_acc[global_idx] = + combine(temp_acc[global_idx], temp_acc[global_idx + countby2]); + } + }); + }); + countby2 *= 2; + } while (countby2 < n_groups); + } + + float answer = temp_buf.template get_access()[0]; + result = answer / (float)num_steps; + return result; +} + +// dpstd_native4 fills a buffer with number 1...num_steps and then +// calls transform_init to calculate the slices and then +// does a reduction in two steps - global and then local. 
+template +float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) { + std::vector data(num_steps); + float result = 0.0; + + buffer buf2{data.data(), range<1>{num_steps}}; + + // fill buffer with 1...num_steps + policy.queue().submit([&](handler& h) { + auto writeresult = buf2.get_access(h); + h.parallel_for(range<1>{num_steps}, + [=](id<1> idx) { writeresult[idx[0]] = (float)idx[0]; }); + }); + policy.queue().wait(); + + auto calc_begin = oneapi::dpl::begin(buf2); + auto calc_end = oneapi::dpl::end(buf2); + + using Functor2 = oneapi::dpl::unseq_backend::walk_n; + + // The buffer has 1...num it at and now we will use that as an input + // to the slice structue which will calculate the area of each + // rectangle. + auto tf_init = + oneapi::dpl::unseq_backend::transform_init, + Functor2>{ + std::plus(), Functor2{slice_area(num_steps)}}; + + auto combine = std::plus(); + auto brick_reduce = + oneapi::dpl::unseq_backend::reduce, float>{ + std::plus()}; + + // get workgroup_size from the device + auto workgroup_size = + policy.queue() + .get_device() + .template get_info(); + + // get number of compute units from device. 
+ auto max_comp_u = policy.queue() + .get_device() + .template get_info(); + + auto n_groups = (num_steps - 1) / workgroup_size + 1; + + // use the smaller of the number of workgroups device has or the + // number of steps/workgroups + n_groups = std::min(decltype(n_groups)(max_comp_u), n_groups); + + // Create temporary global buffer to store temporary value + auto temp_buf = buffer(range<1>(n_groups)); + + // Reduce over each work_group + auto local_reduce_event = + policy.queue().submit([&buf2, &temp_buf, &brick_reduce, &tf_init, + num_steps, n_groups, workgroup_size](handler& h) { + // grab access to the previous input + auto access_buf = buf2.template get_access(h); + auto temp_acc = + temp_buf.template get_access(h); + // Create temporary local buffer + accessor + temp_buf_local(range<1>(workgroup_size), h); + h.parallel_for(nd_range<1>(range<1>(n_groups * workgroup_size), + range<1>(workgroup_size)), + [=](nd_item<1> item_id) mutable { + auto global_idx = item_id.get_global_id(0); + // 1. Initialization (transform part). Fill local + // memory + tf_init(item_id, global_idx, access_buf, num_steps, + temp_buf_local); + // 2. 
Reduce within work group + float local_result = brick_reduce( + item_id, global_idx, num_steps, temp_buf_local); + if (item_id.get_local_id(0) == 0) { + temp_acc[item_id.get_group(0)] = local_result; + } + }); + }); + + // global reduction + auto reduce_event = local_reduce_event; + if (n_groups > 1) { + auto countby2 = decltype(n_groups)(1); + do { + reduce_event = policy.queue().submit([&reduce_event, &temp_buf, &combine, + countby2, n_groups](handler& h) { + h.depends_on(reduce_event); + auto temp_acc = + temp_buf.template get_access(h); + h.parallel_for(range<1>(n_groups), [=](item<1> item_id) mutable { + auto global_idx = item_id.get_linear_id(); + + if (global_idx % (2 * countby2) == 0 && + global_idx + countby2 < n_groups) { + temp_acc[global_idx] = + combine(temp_acc[global_idx], temp_acc[global_idx + countby2]); + } + }); + }); + countby2 *= 2; + } while (countby2 < n_groups); + } + float answer = temp_buf.template get_access()[0]; + result = answer / (float)num_steps; + + return result; +} + +// This function shows the use of two different DPC++ library calls. +// The first is a transform calls which will fill a buff with the +// calculations of each small rectangle. The second call is the reduce +// call which sums up the results of all the elements in the buffer. +template +float calc_pi_dpstd_two_steps_lib(int num_steps, Policy&& policy) { + float step = 1.0 / (float)num_steps; + + buffer calc_values{num_steps}; + auto calc_begin2 = oneapi::dpl::begin(calc_values); + auto calc_end2 = oneapi::dpl::end(calc_values); + + // use DPC++ library call transform to fill the buffer with + // the area calculations for each rectangle. 
+ std::transform(policy, oneapi::dpl::counting_iterator(1), + oneapi::dpl::counting_iterator(num_steps), calc_begin2, + [=](int i) { + float x = (((float)i - 0.5f) / (float)(num_steps)); + return (4.0f / (1.0f + x * x)); + }); + + policy.queue().wait(); + + // use the DPC++ library call to reduce the array using plus + float result = + std::reduce(policy, calc_begin2, calc_end2, 0.0f, std::plus()); + policy.queue().wait(); + + result = result / (float)num_steps; + + return result; +} + +// This function uses the DPC++ library call +// transform reduce. It does everything in one library +// call. +template +float calc_pi_dpstd_onestep(int num_steps, Policy& policy) { + float step = 1.0f / (float)num_steps; + + float total = std::transform_reduce( + policy, oneapi::dpl::counting_iterator(1), + oneapi::dpl::counting_iterator(num_steps), 0.0f, std::plus(), + [=](int i) { + float x = (float)(((float)i - 0.5f) / (float(num_steps))); + return (4.0f / (1.0f + x * x)); + }); + total = total * (float)step; + + return total; +} + +int main(int argc, char** argv) { + int num_steps = 1000000; + printf("Number of steps is %d\n", num_steps); + int groups = 10000; + + float pi; + queue myQueue{property::queue::in_order()}; + auto policy = oneapi::dpl::execution::make_device_policy( + queue(default_selector{}, dpc_common::exception_handler)); + + // Since we are using JIT compiler for samples, + // we need to run each step once to allow for compile + // to occur before we time execution of function. 
+ pi = calc_pi_dpstd_native(num_steps, policy); + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + pi = calc_pi_dpstd_onestep(num_steps, policy); + + dpc_common::TimeInterval T; + pi = calc_pi_cpu_seq(num_steps); + auto stop = T.Elapsed(); + std::cout << "Cpu Seq calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop << " seconds\n"; + + dpc_common::TimeInterval T2; + pi = calc_pi_cpu_tbb(num_steps); + auto stop2 = T2.Elapsed(); + std::cout << "Cpu TBB calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop2 << " seconds\n"; + + dpc_common::TimeInterval T3; + pi = calc_pi_dpstd_native(num_steps, policy); + auto stop3 = T3.Elapsed(); + std::cout << "dpstd native:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3 << " seconds\n"; + + dpc_common::TimeInterval T3a; + pi = calc_pi_dpstd_native2(num_steps, policy, groups); + auto stop3a = T3a.Elapsed(); + std::cout << "dpstd native2:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3a << " seconds\n"; + + dpc_common::TimeInterval T3b; + pi = calc_pi_dpstd_native3(num_steps, groups, policy); + auto stop3b = T3b.Elapsed(); + std::cout << "dpstd native3:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3b << " seconds\n"; + + dpc_common::TimeInterval T3c; + pi = calc_pi_dpstd_native4(num_steps, groups, policy); + auto stop3c = T3c.Elapsed(); + std::cout << "dpstd native4:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3c << " seconds\n"; + + dpc_common::TimeInterval T4; + pi = calc_pi_dpstd_two_steps_lib(num_steps, policy); + auto stop4 = T4.Elapsed(); + std::cout << "dpstd two steps:\t"; + std::cout << 
std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop4 << " seconds\n"; + + dpc_common::TimeInterval T5; + pi = calc_pi_dpstd_onestep(num_steps, policy); + auto stop5 = T5.Elapsed(); + std::cout << "dpstd transform_reduce: "; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop5 << " seconds\n"; + + std::cout << "success\n"; + return 0; +} diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/CMakeLists.txt new file mode 100644 index 0000000000..069c03849e --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/CMakeLists.txt @@ -0,0 +1,12 @@ +set(CMAKE_CXX_COMPILER "icpx") +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release") + set (CMAKE_BUILD_TYPE "Release" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +cmake_minimum_required (VERSION 3.0) +project(openmp_reduction LANGUAGES CXX) +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/License.txt b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/License.txt new file mode 100644 index 0000000000..9cde07f558 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/License.txt @@ -0,0 +1,8 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be 
included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/README.md b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/README.md new file mode 100644 index 0000000000..3836e7fc0e --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/README.md @@ -0,0 +1,67 @@ +# openmp_reduction Sample + +The openmp_reduction sample is a simple program that calculates pi. This program is implemented using C++ and openMP for Intel(R) CPU and accelerators. + +For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + +| Optimized for | Description +| OS | Linux* Ubuntu* 18.04, +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta) +| What you will learn | How to run openMP on cpu as well as GPU offload +| Time to complete | 10 min + +## Purpose +This example demonstrates how to do reduction by using the CPU in serial mode, +the CPU in parallel mode (using openMP), the GPU using openMP offloading. + +All the different modes use a simple calculation for Pi. It is a well known +mathematical formula that if you integrate from 0 to 1 over the function, +(4.0 / (1+x*x) )dx the answer is pi. One can approximate this integral +by summing up the area of a large number of rectangles over this same range. 
+ +Each of the different functions calculates pi by breaking the range into many +tiny rectangles and then summing up the results. + +## Key Implementation Details +This code shows how to use OpenMP on the CPU host as well as using target offload capabilities. + +## License +This code sample is licensed under MIT license. + +## Building the openmp_reduction program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +Perform the following steps: + +mkdir build +cd build +cmake .. + +1. Build the program using the following make commands +make + +2. Run the program using: +make run or src/openmp_reduction + +3. Clean the program using: +make clean + + +## Running the Sample + +### Application Parameters +There are no editable parameters for this sample.
+ +### Example of Output (results vary depending on hardware) +Number of steps is 1000000 +Cpu Seq calc: PI =3.14 in 0.00105 seconds +Host OpenMP: PI =3.14 in 0.0010 seconds +Offload OpenMP: PI =3.14 in 0.0005 seconds +success diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/sample.json b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/sample.json new file mode 100644 index 0000000000..78b550e82c --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/sample.json @@ -0,0 +1,29 @@ + { + "guid": "ECF6C8EB-753B-4107-AF64-60662CE41726", + "name": "DPC Reduce", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/oneAPI DPC++ Library/CPU and GPU"], + "description": "It models transform reduce in different ways showing capability of oneAPI.", + "toolchain": ["dpcpp"], + "languages": [{ + "cpp": {} + }], + "targetDevice": ["CPU", "GPU"], + "os": ["linux"], + "builder": ["cmake"], + "ciTests": { + "linux": [ + { + "id": "dpc_reduce", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make ", + "./src/openmp_reduction" + ] + } + ] + } +} + + diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/CMakeLists.txt new file mode 100644 index 0000000000..90721a5f66 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/CMakeLists.txt @@ -0,0 +1,24 @@ +if (NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 14) +endif() + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fiopenmp -fopenmp-targets=spir64 -fsycl") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + +# Add an executable target from source files +add_executable(${PROJECT_NAME} main.cpp) + +if(WIN32) + # Specify libraries to link with + target_link_libraries(${PROJECT_NAME} sycl ) + + # Add custom target for running + add_custom_target(run
${PROJECT_NAME}.exe) +else() + # Add custom target for running + add_custom_target(run ./${PROJECT_NAME}) +endif() diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/main.cpp new file mode 100644 index 0000000000..b36aae7ab5 --- /dev/null +++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/main.cpp @@ -0,0 +1,106 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include // setprecision library +#include +#include "dpc_common.hpp" +// Many oneAPI code samples share common include files. These +// include files are installed locally with the product installation +// and can be located at %ONEAPI_ROOT%\dev-utilities\latest\include +// on your development system. + + +// cpu_seq_calc_pi is a simple sequential CPU routine +// that calculates all the slices and then +// does a reduction. +float cpu_seq_calc_pi(int num_steps) { + float step = 1.0 / (float)num_steps; + float x; + float pi; + float sum = 0.0; + for (int i = 1; i < num_steps; i++) { + x = ((float)i - 0.5f) * step; + sum = sum + 4.0f / (1.0f + x * x); + } + pi = sum * step; + return pi; +} + +// openmp_host_calc_pi is a simple parallel +// calcuation that uses openmp running +// on the host. By default openmp +// will use all the cores available +// and execute the code in parallel and +// then perform a reduction. 
+float openmp_host_calc_pi(int num_steps) { + float step = (1.0f / num_steps); + float pi = 0.0; + float sum = 0.0; +#pragma omp parallel for reduction(+ : sum) + for (int i = 1; i < num_steps; i++) { + float x = ((float)i - 0.5f) * step; + sum = sum + 4.0f / (1.0f + x * x); + } + pi = step * sum; + return pi; +} + +// openmp_device_calc_pi is a simple parallel +// calcuation that uses openmp running +// on the device through the use of the +// target specifier. +// This will execute the code in parallel. + +float openmp_device_calc_pi(int num_steps) { + float pi = 0.0; + float step = (1.0f / num_steps); + float sum = 0.0; +#pragma omp target teams distribute parallel for reduction(+ : sum) + for (int i = 1; i < num_steps; i++) { + float x = ((float)i - 0.5f) * step; + sum = sum + 4.0f / (1.0 + x * x); + } + pi = sum * step; + return pi; +} + +int main(int argc, char** argv) { + int num_steps = 1000000; + printf("Number of steps is %d\n", num_steps); + float pi; + + // Due to the overhead associated with + // JIT, run the offload calculation once + // that allows code to be compiled. Execution + // time is measured the 2nd time you run it. 
+ pi = openmp_device_calc_pi(num_steps); + + dpc_common::TimeInterval T; + pi = cpu_seq_calc_pi(num_steps); + auto stop = T.Elapsed(); + std::cout << "Cpu Seq calc: \t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop << " seconds" + << "\n"; + + dpc_common::TimeInterval T2; + pi = openmp_host_calc_pi(num_steps); + auto stop2 = T2.Elapsed(); + std::cout << "Host OpenMP:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop2 << " seconds" + << "\n"; + + dpc_common::TimeInterval T3; + pi = openmp_device_calc_pi(num_steps); + auto stop3 = T3.Elapsed(); + std::cout << "Offload OpenMP:\t\t"; + std::cout << std::setprecision(3) << "PI =" << pi; + std::cout << " in " << stop3 << " seconds" + << "\n"; + + std::cout << "success\n"; + return 0; +} diff --git a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md index fd706c0b84..482899704b 100644 --- a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md +++ b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md @@ -8,7 +8,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa |:--- |:--- | OS | Linux* Ubuntu* 18.04; Windows 10 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler beta; +| Software | Intel® oneAPI DPC++/C++ Compiler; | What you will learn | How to parallel process image data using DPC++ for producing a Discrete Cosine Transform | Time to complete | 15 minutes diff --git a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json index a6ff50dad1..0f1a243409 100644 --- a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json +++ b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json @@ -1,7 +1,7 @@ { 
"name": "Discrete Cosine Transform", "description": "An image processing algorithm as seen in the JPEG compression standard.", - "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU"], + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU"], "os": ["linux", "windows"], "builder": ["ide", "cmake"], "languages": [{"cpp":{}}], diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md index 6459b25e05..346752f830 100644 --- a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md +++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md @@ -12,8 +12,8 @@ and search based on relevant terms noted in the comments. |:--- |:--- | OS | Linux Ubuntu 18.04 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) -| What you will learn | How to simulate 1D Heat Transfer using Intel DPC++ compiler +| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | How to simulate 1D Heat Transfer using Intel® oneAPI DPC++/C++ Compiler | Time to complete | 10 minutes diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md index 604dd14b56..03b33a9171 100644 --- a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md @@ -17,8 +17,8 @@ and search based on relevant terms noted in the comments. 
|:--- |:--- | OS | Linux Ubuntu 18.04 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) -| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++/C++ Compiler | Time to complete | 10 minutes @@ -53,9 +53,12 @@ global ID variable) for a single time step. This code sample is licensed under MIT license. - ## Building the `iso2dfd` Program for CPU and GPU +### Include Files + +The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include on your development system. + ### Running Samples In DevCloud If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, @@ -92,18 +95,6 @@ Perform the following steps: Right click on the project in Solution explorer and select Rebuild. From top menu select Debug -> Start without Debugging. ->If you see the following error message when compiling this sample: -> -``` -Error 'dpc_common.hpp' file not found -``` ->You need to add the following directory to the list of include folders, that are required by your project, in your project's Visual Studio project property panel. The missing include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. 
- -* Build the program using MSBuild - Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" - Run - MSBuild iso2dfd.sln /t:Rebuild /p:Configuration="Release" - - ## Running the Sample ### Application Parameters diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp index 710d87051b..62bd936ccf 100644 --- a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md index 516f9c1ba6..67005704b9 100644 --- a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md @@ -8,11 +8,11 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa |:--- |:--- | OS | Linux* Ubuntu* 18.04; Windows 10 | Hardware | Skylake with GEN9 or newer -| Software | Intel® oneAPI DPC++ Compiler beta; -| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Software | Intel® oneAPI DPC++/C++ Compiler; +| What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++/C++ Compiler | Time to complete | 15 minutes -Performance number tabulation [if applicable -- **NO for beta**] +Performance number tabulation | iso3dfd sample | Performance data |:--- |:--- diff --git a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md index 50c61fa567..e5a208706b 100644 --- a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md +++ b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md @@ 
-14,11 +14,11 @@ and search based on relevant terms noted in the comments. |:--- |:--- | OS | Linux Ubuntu 18.04; Windows 10 or Windows Server 2017 | Hardware | Kaby Lake with GEN9 or newer -| Software | Intel Data Parallel C++ Compiler (beta) -| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Software | Intel® oneAPI DPC++/C++ Compiler +| What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++/C++ Compiler | Time to complete | 15 minutes -Performance number tabulation [if applicable] +Performance number tabulation | motionsim sample | Performance data |:--- |:--- @@ -104,18 +104,6 @@ Perform the following steps: Right click on the project in Solution explorer and select Rebuild From top menu select Debug -> Start without Debugging ->If you see the following error message when compiling this sample: -> -``` -Error 'dpc_common.hpp' file not found -``` ->You need to add the following directory to the list of include folders, that are required by your project, in your project's Visual Studio project property panel. The missing include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. 
- - * Build the program using MSBuild - Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" - Run - MSBuild Particle_Diffusion.sln /t:Rebuild /p:Configuration="Release" - - ## Running the Sample ### Application Parameters diff --git a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp index fda492d9e0..efbdb7c728 100644 --- a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp +++ b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp @@ -25,7 +25,6 @@ // #include -#include #include #include #include diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt new file mode 100755 index 0000000000..6ae6386d49 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(CRR) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, 
subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md new file mode 100755 index 0000000000..ab98bae8d7 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md @@ -0,0 +1,224 @@ +# CRR Binomial Tree Model for Option Pricing +An FPGA-optimized reference design computing the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options. + +The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. Additional reference material specific to option pricing algorithms is provided in the References section of this README. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Review a high performance DPC++ design optimized for FPGA +| Time to complete | 1 hr (not including compile time) + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + + +**Performance** +Please refer to performance disclaimer at the end of this README. + +| Device | Throughput +|:--- |:--- +| Intel® PAC with Intel Arria® 10 GX FPGA | 118 assets/s +| Intel® PAC with Intel Stratix® 10 SX FPGA | 243 assets/s + + +## Purpose +This sample implements the Cox-Ross-Rubinstein (CRR) binomial tree model that is used in the finance field for American exercise options with five Greeks (delta, gamma, theta, vega and rho). The simple idea is to model all possible assets price paths using a binomial tree. + +## Key Implementation Details + +### Design Inputs +This design reads inputs from the `ordered_inputs.csv` file. The inputs are: + +| Input | Description +--- |--- +| `n_steps` | Number of time steps in the binomial tree. The maximum `n_steps` in this design is 8189. +| `cp` | -1 or 1 represents put and call options, respectively. +| `spot` | Spot price of the underlying price. +| `fwd` | Forward price of the underlying price. +| `strike` | Exercise price of the option. +| `vol` | Percent volatility that the design reads as a decimal value. +| `df` | Discount factor to option expiry. +| `t` | Time, in years, to the maturity of the option. + +### Design Outputs +This design writes outputs to the `ordered_outputs.csv` file. The outputs are: + +| Output | Description +--- |--- +| `value` | Option price +| `delta` | Measures the rate of change of the theoretical option value with respect to changes in the underlying asset's price. +| `gamma` | Measures the rate of change in the `delta` with respect to changes in the underlying price. +| `vega` | Measures sensitivity to volatility. 
+| `theta` | Measures the sensitivity of the value of the derivative to the passage of time. +| `rho` | Measures sensitivity to the interest rate. + +### Design Correctness +This design tests the correctness of the optimized FPGA code by comparing its output to a golden result computed on the CPU. + +### Design Performance +This design measures the FPGA performance to determine how many assets can be processed per second. + +## License +This code sample is licensed under MIT license. + +## Building the CRR Program + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (FPGA) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 48h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. 
(Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Running the Reference Design + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./crr.fpga_emu [-o=] (Linux) + + crr.fpga_emu.exe [-o=] (Windows) + ``` + 2. Run the sample on the FPGA device: + ``` + ./crr.fpga [-o=] (Linux) + ``` + +### Application Parameters + +| Argument | Description +--- |--- +| `` | Optional argument that provides the input data. The default file is `/data/ordered_inputs.csv` +| `-o=` | Optional argument that specifies the name of the output file. The default name of the output file is `ordered_outputs.csv`. + +### Example of Output +``` +============ Correctness Test ============= +Running analytical correctness checks... 
+CPU-FPGA Equivalence: PASS + +============ Throughput Test ============= +Avg throughput: 66.2 assets/s +``` + +## Additional Design Information + +### Source Code Explanation + +| File | Description +--- |--- +| `main.cpp` | Contains both host code and SYCL* kernel code. +| `CRR_common.hpp` | Header file for `main.cpp`. Contains the data structures needed for both host code and SYCL* kernel code. + + + +### Backend Compiler Flags Used + +| Flag | Description +--- |--- +`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator) +`-Xsdaz` | Denormals are zero +`-Xsrounding=faithful` | Rounds results to either the upper or lower nearest single-precision numbers +`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus +`-Xsseed=2` | Uses seed 2 during Quartus, yields slightly higher fMAX + +### Preprocessor Define Flags + +| Flag | Description +--- |--- +`-DOUTER_UNROLL=1` | Uses the value 1 for the constant OUTER_UNROLL, controls the number of CRRs that can be processed in parallel +`-DINNER_UNROLL=64` | Uses the value 64 for the constant INNER_UNROLL, controls the degree of parallelization within the calculation of 1 CRR +`-DOUTER_UNROLL_POW2=1` | Uses the value 1 for the constant OUTER_UNROLL_POW2, controls the number of memory banks + + +NOTE: The Xsseed, DOUTER_UNROLL, DINNER_UNROLL and DOUTER_UNROLL_POW2 values differ depending on the board being targeted. More information about the unroll factors can be found in `/src/CRR_common.hpp`. + +### Performance disclaimers + +Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks). 
+ +Performance results are based on testing as of July 20, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure. + +Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com). + +The performance was measured by Intel on July 20, 2020 + +Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries. + +(C) Intel Corporation. + +### References + +[Khronos SYCL Resources](https://www.khronos.org/sycl/resources) + +[Binomial options pricing model](https://en.wikipedia.org/wiki/Binomial_options_pricing_model) + +[Wiki page for finance Greeks](https://en.wikipedia.org/wiki/Greeks_(finance)) + +[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer) + diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln new file mode 100755 index 0000000000..a95fce9c30 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crr", "crr.vcxproj", "{8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.ActiveCfg = Debug|x64 + {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.Build.0 = Debug|x64 + 
{8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.ActiveCfg = Release|x64 + {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {6887ACDD-3E54-4396-A921-99C630333932} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj new file mode 100755 index 0000000000..62a523e96c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj @@ -0,0 +1,165 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + 15.0 + {8eb512ff-4487-4fec-9b88-8c0da734b1b2} + Win32Proj + crr + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions) + false + $(IntDir)crr.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions) + $(IntDir)crr.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + 
Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user new file mode 100755 index 0000000000..9115b3f275 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user @@ -0,0 +1,14 @@ + + + + false + + + ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv + WindowsLocalDebugger + + + ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json new file mode 100755 index 0000000000..6155ce223d --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "D725E06E-0ECE-44F8-910D-AD1A8C89ED89", + "name": "CRR Binomial Tree Model for Option Pricing", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"], + "description": "FPGA-optimized reference design of the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "builder": ["ide", "cmake"], + "targetDevice": ["FPGA"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./crr.fpga_emu ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "crr.fpga_emu.exe ./data/ordered_inputs.csv -o=./data/ordered_outputs.csv" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git 
a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt new file mode 100755 index 0000000000..8c56a699ad --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt @@ -0,0 +1,116 @@ +set(SOURCE_FILE main.cpp) +set(TARGET_NAME crr) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) +set(REPORTS_TARGET ${TARGET_NAME}_report) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Design specific constant values +set(OUTER_UNROLL_A10 1) +set(INNER_UNROLL_A10 64) +set(OUTER_UNROLL_POW2_A10 1) +set(OUTER_UNROLL_S10 2) +set(INNER_UNROLL_S10 64) +set(OUTER_UNROLL_POW2_S10 2) +set(SEED_A10 1) +set(SEED_S10 2) + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) +SET(OUTER_UNROLL ${OUTER_UNROLL_A10}) +SET(INNER_UNROLL ${INNER_UNROLL_A10}) +SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_A10}) +SET(SEED ${SEED_A10}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + SET(OUTER_UNROLL ${OUTER_UNROLL_S10}) + SET(INNER_UNROLL ${INNER_UNROLL_S10}) + SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_S10}) + SET(SEED ${SEED_S10}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS -fintelfpga -c -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2}) + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +separate_arguments(USER_HARDWARE_FLAGS) +set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsdaz -Xsrounding=faithful -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2}) +set(FINAL_LINK_FLAGS -fintelfpga -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2}) + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2}") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +#copy input data +configure_file("data/ordered_inputs.csv" "data/ordered_inputs.csv" COPYONLY) + +# fpga emulator +if(WIN32) + 
set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpgas +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set(DEVICE_FPGA_OBJ "crr_fpga.o") + + add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${DEVICE_FPGA_OBJ} + DEPENDS ${SOURCE_FILE}) + + add_custom_command(OUTPUT ${FPGA_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} ${DEVICE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} + DEPENDS ${DEVICE_FPGA_OBJ}) +endif() + +# fpga report +if(WIN32) + add_custom_target(report DEPENDS ${REPORTS_TARGET} ) + + separate_arguments(WIN_FLAGS WINDOWS_COMMAND) + add_custom_command(OUTPUT ${REPORTS_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET} + DEPENDS ${SOURCE_FILE}) + +else() + add_custom_target(report DEPENDS ${REPORTS_TARGET} ) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CRR_common.hpp 
CRR_common.hpp COPYONLY) + + add_custom_command(OUTPUT ${REPORTS_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET} + DEPENDS ${SOURCE_FILE} CRR_common.hpp) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu data/ordered_inputs.csv -o=data/ordered_output.csv + DEPENDS ${TARGET_NAME}.fpga_emu) + diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp new file mode 100755 index 0000000000..6f2537e1e0 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp @@ -0,0 +1,149 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#ifndef __CRR_COMMON_H__ +#define __CRR_COMMON_H__ + +constexpr int kMaxStringLen = 1024; + +// Increments of kMaxNSteps +constexpr size_t kMaxNSteps = 8189; +constexpr size_t kMaxNSteps1 = 8190; +constexpr size_t kMaxNSteps2 = 8191; +constexpr size_t kMaxNSteps3 = 8192; + +// Increment by a small epsilon in order to compute derivative +// of option price with respect to Vol or Interest. The derivatives +// are then used to compute Vega and Rho. +constexpr double kEpsilon = 0.0001; + +// Whenever calculations are made for Option Price 0, need to increment +// nsteps by 2 to ensure all the required derivative prices are calculated. +constexpr size_t kOpt0 = 2; + + +// Solver configuration settings that are dependent on selected +// board. Most notable settings are: + +// OUTER_UNROLL controls the number of CRRs that can be processed +// in parallel in a SIMD fashion (number of CRRS must be >= OUTER_UNROLL). +// This is ideally a power of two, but does not have to be. Since +// the DRAM bandwidth requirement is low, increasing OUTER_UNROLL +// should result in fairly linear speedup. (max: 32 on PAC A10) + +// INNER_UNROLL controls the degree of parallelization within +// the calculation of a single CRR. This must be a power of two. Increasing +// INNER_UNROLL has a lower area overhead than increasing OUTER_UNROLL; +// however, there are diminishing returns as INNER_UNROLL is increased with +// respect to the number of time steps. (max: 128 on PAC A10) + + +// Data structure for original input data. 
+typedef struct { + int cp; /* cp = -1 or 1 for Put & Call respectively. */ + double n_steps; /* n_steps = number of time steps in the binomial tree. */ + double strike; /* strike = exercise price of option. */ + double spot; /* spot = spot price of the underlying. */ + double fwd; /* fwd = forward price of the underlying. */ + double vol; /* vol = per cent volatility, input as a decimal. */ + double df; /* df = discount factor to option expiry. */ + double t; /* t = time in years to the maturity of the option. */ + +} InputData; + +// Data structure as the inputs to FPGA. +// Element[i] is used to compute option_price[i]. +typedef struct { + double n_steps; /* n_steps = number of time steps in the binomial tree. */ + double u[3]; /* u = the increase factor of a up movement in the binomial tree, + same for each time step. */ + double u2[3]; /* u2 = the square of increase factor. */ + double c1[3]; /* c1 = the probality of a down movement in the binomial tree, + same for each time step. */ + double c2[3]; /* c2 = the probality of a up movement in the binomial tree. */ + double umin[3]; /* umin = minimum price of the underlying at the maturity. */ + double param_1[3];/* param_1[i] = cp * umin[i] */ + double param_2; /* param_2 = cp * strike */ + +} CRRInParams; + +// Data structure as the output from ProcessKernelResult(). +typedef struct { + double pgreek[4]; /* Stores the 4 derivative prices in the binomial tree + required to compute the Premium and Greeks. */ + double vals[3]; /* Three option prices calculated */ + +} InterRes; + +// Data structure for option price and five Greeks. +typedef struct { + double value; /* value = option price. 
*/ + double delta; + double gamma; + double vega; + double theta; + double rho; +} OutputRes; + +// Data structures required by the kernel +typedef struct { + double u; + double c1; + double c2; + double param_1; + double param_2; + short n_steps; + short pad1; + int pad2; + double pad3; + double pad4; +} CRRMeta; + +typedef struct { + double u2; + double p1powu; + double init_optval; + double pad; +} ArrayEle; + +typedef struct { + ArrayEle array_eles[kMaxNSteps3][3]; /* Second dimension size set to 3 to have a + separate ArrayEle for each option price */ +} CRRArrayEles; + +typedef struct { + ArrayEle array_eles[kMaxNSteps3]; +} CRRPerStepMeta; + +typedef struct { + double pgreek[4]; + double optval0; + double pad[3]; +} CRRResParams; + +#endif diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja new file mode 100755 index 0000000000..58af917f67 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja @@ -0,0 +1,35 @@ +source_file = main.cpp +target_name = crr + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware -Xsfpc -Xsparallel=2 -Xsseed=5 +emulator_flags = -fintelfpga -DFPGA_EMULATOR +a10_flags = -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 +s10_flags = -DOUTER_UNROLL=2 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=2 + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} ${a10_flags} $in -o $out + +rule build_fpga_emu_s10 + command = dpcpp /GX ${emulator_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${a10_flags} -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} -fsycl-link $in -o $out + +# FPGA 
emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv new file mode 100755 index 0000000000..3a28083fa2 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv @@ -0,0 +1,10 @@ +8189,-1,37.5,37.50112053,85,0.4,0.99997012,0.011952191 +8189,1,37.5,37.50112053,85,0.4,0.99997012,0.011952191 +8189,-1,270,270.0080678,65,0.18,0.999940241,0.011952191 +8189,1,270,270.0080678,65,0.18,0.999940241,0.011952191 +8189,-1,292.5,292.5087402,70,0.35,0.999940241,0.011952191 +8189,1,292.5,292.5087402,70,0.35,0.999940241,0.011952191 +8189,-1,122.5,122.5109816,40,0.2,0.999910363,0.011952191 +8189,1,122.5,122.5109816,40,0.2,0.999910363,0.011952191 +8189,-1,22.5,22.50067232,55,0.3,0.999910363,0.011952191 +8189,1,22.5,22.50067232,55,0.3,0.999910363,0.011952191 diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp new file mode 100755 index 0000000000..7c92610e19 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp @@ -0,0 +1,849 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +//////////////////////////////////////////////////////////////////////////////// +// +// CRRSolver CPU/FPGA Accelerator Demo Program +// +//////////////////////////////////////////////////////////////////////////////// +// +// This design implments simple Cox-Ross-Rubinstein(CRR) binomial tree model +// with Greeks for American exercise options. +// +// +// Optimization summary: +// -- Area-consuming but infrequent calculation is done on CPU. +// -- Parallelize the calculation of a single CRR. +// -- Run multiple independent CRRs in parallel. +// -- Optimized memory configurations to reduce the need for replication +// and to eliminate the need for double-pumping M20Ks. +// +// The following diagram shows the mechanism of optimizations to CRR. 
+// +// +// +------+ ^ +// +------------>|optval| | +// | | [2] | | +// | +------+ | +// | | +// | | +// +--+---+ | +// +------------>|optval| | +// | | [1] | | +// | +--+---+ | +// | | | +// | | | +// | | | Loop4(L4) +// | | | updates +// +---+--+ +------------>+------+ | multiple +// |optval| |optval| | elements +// | [0] | | [1] | | in optval[] +// +---+--+ +------------>+------+ | simultaneously +// | | | +// | | | +// | | | +// | | | +// | +--+---+ | +// | |optval| | +// +------------>| [0] | | +// +--+---+ | +// | | +// | | +// | +------+ | +// | |optval| | +// +------------>| [0] | | +// +------+ + +// +// +// +// +// step 1 step 2 +// +// +// <------------------------------------------+ +// Loop3(L3) updates each level of the tree +// +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "CRR_common.hpp" +#include "dpc_common.hpp" + +using namespace std; +using namespace sycl; + +class CRRSolver; +double CrrSolver(const int n_items, vector &in_params, + vector &res_params, + vector &in_params2, queue &q) { + dpc_common::TimeInterval timer; + + constexpr int steps = kMaxNSteps2; + + const int n_crr = + (((n_items + (OUTER_UNROLL - 1)) / OUTER_UNROLL) * OUTER_UNROLL) * 3; + + { + buffer i_params(in_params.data(), in_params.size()); + buffer r_params(res_params.data(), res_params.size()); + buffer a_params(in_params2.data(), in_params2.size()); + + event e; + { + e = q.submit([&](handler &h) { + auto accessor_v = + i_params.template get_access(h); + + auto accessor_v2 = + a_params.template get_access(h); + + auto accessor_r = + r_params.template get_access(h); + + h.single_task([=]() [[intel::kernel_args_restrict]] { + // Kernel requires n_crr to be a multiple of OUTER_UNROLL. + // This is taken care of by the host. + const int n_crr_div = n_crr / OUTER_UNROLL; + + // Outerloop counter. 
Use while-loop for better timing-closure + // characteristics because it tells the compiler the loop body will + // never be skipped. + int oc = 0; + do { + // Metadata of CRR problems + [[intelfpga::register]] double u[OUTER_UNROLL]; + [[intelfpga::register]] double c1[OUTER_UNROLL]; + [[intelfpga::register]] double c2[OUTER_UNROLL]; + [[intelfpga::register]] double param_1[OUTER_UNROLL]; + [[intelfpga::register]] double param_2[OUTER_UNROLL]; + [[intelfpga::register]] short n_steps[OUTER_UNROLL]; + + // Current values in binomial tree. We only need to keep track of + // one level worth of data, not the entire tree. + [[intelfpga::memory, intelfpga::singlepump, + intelfpga::bankwidth(sizeof(double)), + intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2), + intelfpga::private_copies( + 8)]] double optval[kMaxNSteps3][OUTER_UNROLL_POW2]; + + // Initial values in binomial tree, which correspond to the last + // level of the binomial tree. + [[intelfpga::memory, intelfpga::singlepump, + intelfpga::bankwidth(sizeof(double)), + intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2), + intelfpga::private_copies( + 8)]] double init_optval[kMaxNSteps3][OUTER_UNROLL_POW2]; + + // u2_array precalculates the power function of u2. + [[intelfpga::memory, intelfpga::singlepump, + intelfpga::bankwidth(sizeof(double)), + intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2), + intelfpga::private_copies( + 8)]] double u2_array[kMaxNSteps3][OUTER_UNROLL_POW2]; + + // p1powu_array precalculates p1 multipy the power of u. + [[intelfpga::memory, intelfpga::singlepump, + intelfpga::bankwidth(sizeof(double)), + intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2), + intelfpga::private_copies( + 8)]] double p1powu_array[kMaxNSteps3][OUTER_UNROLL_POW2]; + + // n0_optval stores the binomial tree value corresponding to node 0 + // of a level. 
This is the same as what's stored in + // optval/init_optval, but replicating this data allows us to have + // only one read port for optval and init_optval, thereby removing + // the need of double-pumping or replication. n0_optval_2 is a copy + // of n0_optval that stores the node 0 value for a specific layer of + // the tree. pgreek is the array saving values for post-calculating + // Greeks. + [[intelfpga::register]] double n0_optval[OUTER_UNROLL]; + [[intelfpga::register]] double n0_optval_2[OUTER_UNROLL]; + [[intelfpga::register]] double pgreek[4][OUTER_UNROLL]; + + // L1 + L2: + // Populate init_optval -- calculate the last level of the binomial + // tree. + for (short ic = 0; ic < OUTER_UNROLL; ++ic) { + // Transfer data from DRAM to local memory or registers + const int c = oc * OUTER_UNROLL + ic; + const CRRMeta param = accessor_v[c]; + + u[ic] = param.u; + c1[ic] = param.c1; + c2[ic] = param.c2; + param_1[ic] = param.param_1; + param_2[ic] = param.param_2; + n_steps[ic] = param.n_steps; + + for (short t = steps; t >= 0; --t) { + const ArrayEle param_array = accessor_v2[c].array_eles[t]; + + const double init_val = param_array.init_optval; + + init_optval[t][ic] = init_val; + + // n0_optval intends to store the node value at t == 0. + // Instead of qualifying this statement by an "if (t == 0)", + // which couples the loop counter to the timing path of the + // assignment, we reverse the loop direction so the last value + // stored corresponds to t == 0. + n0_optval[ic] = init_val; + + // Transfer data from DRAM to local memory or registers + u2_array[t][ic] = param_array.u2; + p1powu_array[t][ic] = param_array.p1powu; + } + } + + // L3: + // Update optval[] -- calculate each level of the binomial tree. + // reg[] helps to achieve updating INNER_UNROLL elements in optval[] + // simultaneously. 
+ [[intelfpga::disable_loop_pipelining]] for (short t = 0; + t <= steps - 1; ++t) { + [[intelfpga::register]] double reg[INNER_UNROLL + 1][OUTER_UNROLL]; + + double val_1, val_2; + + #pragma unroll + for (short ic = 0; ic < OUTER_UNROLL; ++ic) { + reg[0][ic] = n0_optval[ic]; + } + + // L4: + // Calculate all the elements in optval[] -- all the tree nodes + // for one level of the tree + [[intelfpga::ivdep]] for (int n = 0; n <= steps - 1 - t; + n += INNER_UNROLL) { + + #pragma unroll + for (short ic = 0; ic < OUTER_UNROLL; ++ic) { + + #pragma unroll + for (short ri = 1; ri <= INNER_UNROLL; ++ri) { + reg[ri][ic] = + (t == 0) ? init_optval[n + ri][ic] : optval[n + ri][ic]; + } + + #pragma unroll + for (short ri = 0; ri < INNER_UNROLL; ++ri) { + const double val = sycl::fmax( + c1[ic] * reg[ri][ic] + c2[ic] * reg[ri + 1][ic], + p1powu_array[t][ic] * u2_array[n + ri][ic] - + param_2[ic]); + + optval[n + ri][ic] = val; + if (n + ri == 0) { + n0_optval[ic] = val; + } + if (n + ri == 1) { + val_1 = val; + } + if (n + ri == 2) { + val_2 = val; + } + } + + reg[0][ic] = reg[INNER_UNROLL][ic]; + + if (t == steps - 5) { + pgreek[3][ic] = val_2; + } + if (t == steps - 3) { + pgreek[0][ic] = n0_optval[ic]; + pgreek[1][ic] = val_1; + pgreek[2][ic] = val_2; + n0_optval_2[ic] = n0_optval[ic]; + } + } + } + } + + // L5: transfer crr_res_paramss to DRAM + #pragma unroll + for (short ic = 0; ic < OUTER_UNROLL; ++ic) { + const int c = oc * OUTER_UNROLL + ic; + if (n_steps[ic] < steps) { + accessor_r[c].optval0 = n0_optval_2[ic]; + } else { + accessor_r[c].optval0 = n0_optval[ic]; + } + accessor_r[c].pgreek[0] = pgreek[0][ic]; + accessor_r[c].pgreek[1] = pgreek[1][ic]; + accessor_r[c].pgreek[2] = pgreek[2][ic]; + accessor_r[c].pgreek[3] = pgreek[3][ic]; + } + // Increment counters + oc += 1; + } while (oc < n_crr_div); + }); + }); + } + } + + double diff = timer.Elapsed(); + return diff; +} + +void ReadInputFromFile(ifstream &input_file, vector &inp) { + string line_of_args; + while 
(getline(input_file, line_of_args)) { + InputData temp; + istringstream line_of_args_ss(line_of_args); + line_of_args_ss >> temp.n_steps; + line_of_args_ss.ignore(1, ','); + line_of_args_ss >> temp.cp; + line_of_args_ss.ignore(1, ','); + line_of_args_ss >> temp.spot; + line_of_args_ss.ignore(1, ','); + line_of_args_ss >> temp.fwd; + line_of_args_ss.ignore(1, ','); + line_of_args_ss >> temp.strike; + line_of_args_ss.ignore(1, ','); + line_of_args_ss >> temp.vol; + line_of_args_ss.ignore(1, ','); + line_of_args_ss >> temp.df; + line_of_args_ss.ignore(1, ','); + line_of_args_ss >> temp.t; + + inp.push_back(temp); + } +} + +static string ToStringWithPrecision(const double value, const int p = 6) { + ostringstream out; + out.precision(p); + out << std::fixed << value; + return out.str(); +} + +void WriteOutputToFile(ofstream &output_file, const vector &outp) { + size_t n = outp.size(); + for (size_t i = 0; i < n; ++i) { + OutputRes temp; + temp = outp[i]; + string line = ToStringWithPrecision(temp.value, 12) + " " + + ToStringWithPrecision(temp.delta, 12) + " " + + ToStringWithPrecision(temp.gamma, 12) + " " + + ToStringWithPrecision(temp.vega, 12) + " " + + ToStringWithPrecision(temp.theta, 12) + " " + + ToStringWithPrecision(temp.rho, 12) + "\n"; + + output_file << line; + } +} + +bool FindGetArgString(const string &arg, const char *str, char *str_value, + size_t maxchars) { + size_t found = arg.find(str, 0, strlen(str)); + if (found != string::npos) { + const char *sptr = &arg.c_str()[strlen(str)]; + for (int i = 0; i < maxchars - 1; i++) { + char ch = sptr[i]; + switch (ch) { + case ' ': + case '\t': + case '\0': + str_value[i] = 0; + return true; + break; + default: + str_value[i] = ch; + break; + } + } + return true; + } + return false; +} + +// Perform data pre-processing work +// Three different option prices are required to solve each CRR problem +// The following lists why each option price is required: +// [0] : Used to compute Premium, Delta, Gamma and Theta 
+// [1] : Used to compute Rho +// [2] : Used to compute Vega +CRRInParams PrepareData(const InputData &inp) { + CRRInParams in_params; + in_params.n_steps = inp.n_steps; + + double r[2]; + r[0] = pow(inp.df, 1.0 / inp.n_steps); + double d_df = exp(-inp.t * kEpsilon); + r[1] = pow(inp.df * d_df, 1.0 / inp.n_steps); + in_params.u[0] = exp(inp.vol * sqrt(inp.t / inp.n_steps)); + in_params.u[1] = in_params.u[0]; + in_params.u[2] = exp((inp.vol + kEpsilon) * sqrt(inp.t / inp.n_steps)); + + in_params.u2[0] = in_params.u[0] * in_params.u[0]; + in_params.u2[1] = in_params.u[1] * in_params.u[1]; + in_params.u2[2] = in_params.u[2] * in_params.u[2]; + in_params.umin[0] = inp.spot * pow(1 / in_params.u[0], inp.n_steps + kOpt0); + in_params.umin[1] = inp.spot * pow(1 / in_params.u[1], inp.n_steps); + in_params.umin[2] = inp.spot * pow(1 / in_params.u[2], inp.n_steps); + in_params.c1[0] = + r[0] * (in_params.u[0] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) / + (in_params.u[0] - 1 / in_params.u[0]); + in_params.c1[1] = + r[1] *(in_params.u[1] - pow((inp.fwd / d_df) / inp.spot, 1.0 / inp.n_steps)) / + (in_params.u[1] - 1 / in_params.u[1]); + in_params.c1[2] = + r[0] * (in_params.u[2] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) / + (in_params.u[2] - 1 / in_params.u[2]); + in_params.c2[0] = r[0] - in_params.c1[0]; + in_params.c2[1] = r[1] - in_params.c1[1]; + in_params.c2[2] = r[0] - in_params.c1[2]; + + in_params.param_1[0] = inp.cp * in_params.umin[0]; + in_params.param_1[1] = inp.cp * in_params.umin[1]; + in_params.param_1[2] = inp.cp * in_params.umin[2]; + in_params.param_2 = inp.cp * inp.strike; + + return in_params; +} + +CRRArrayEles PrepareArrData(const CRRInParams &in) { + CRRArrayEles arr; + + // Write in reverse t-direction to match kernel access pattern + for (int i = 0; i <= in.n_steps + kOpt0; ++i) { + for (int inner_func_index = 0; inner_func_index < 3; ++inner_func_index) { + arr.array_eles[i][inner_func_index].u2 = pow(in.u2[inner_func_index], i); + 
arr.array_eles[i][inner_func_index].p1powu = + in.param_1[inner_func_index] * pow(in.u[inner_func_index], i + 1); + arr.array_eles[i][inner_func_index].init_optval = + fmax(in.param_1[inner_func_index] * pow(in.u2[inner_func_index], i) - + in.param_2, 0.0); + } + } + + return arr; +} + +// Metadata, used in the Kernel, is generated from the input data +// Each CRR problem is split into 3 subproblems to calculate +// each required option price separately +void PrepareKernelData(vector &in_params, + vector &array_params, + vector &in_buff_params, + vector &in_buff2_params, + const int n_crrs) { + + constexpr short offset = 0; + + for (int wi_idx = offset, dst = offset * 3; wi_idx < n_crrs; ++wi_idx) { + CRRInParams &src_crr_params = in_params[wi_idx]; + + CRRArrayEles &src_crr_eles = array_params[wi_idx]; + + for (int inner_func_index = 0; inner_func_index < 3; + ++inner_func_index, ++dst) { + CRRMeta &dst_crr_meta = in_buff_params[dst]; + CRRPerStepMeta &dst_crr_per_step_meta = in_buff2_params[dst]; + + dst_crr_meta.u = src_crr_params.u[inner_func_index]; + dst_crr_meta.c1 = src_crr_params.c1[inner_func_index]; + dst_crr_meta.c2 = src_crr_params.c2[inner_func_index]; + + dst_crr_meta.param_1 = src_crr_params.param_1[inner_func_index]; + dst_crr_meta.param_2 = src_crr_params.param_2; + + if (inner_func_index == 0) { + dst_crr_meta.n_steps = src_crr_params.n_steps + kOpt0; + } else { + dst_crr_meta.n_steps = src_crr_params.n_steps; + } + for (int i = 0; i <= kMaxNSteps2; ++i) { + dst_crr_per_step_meta.array_eles[i].u2 = + src_crr_eles.array_eles[i][inner_func_index].u2; + dst_crr_per_step_meta.array_eles[i].p1powu = + src_crr_eles.array_eles[i][inner_func_index].p1powu; + dst_crr_per_step_meta.array_eles[i].init_optval = + src_crr_eles.array_eles[i][inner_func_index].init_optval; + } + } + } +} + +// Takes in the result from the kernel and stores the 3 option prices +// belonging to the same CRR problem in one InterRes element +void ProcessKernelResult(const vector 
&res_params, + vector &postp_buff, const int n_crrs) { + constexpr int offset = 0; + + for (int wi_idx = offset, src = offset * 3; wi_idx < n_crrs; ++wi_idx) { + InterRes &dst_res = postp_buff[wi_idx]; + + for (int inner_func_index = 0; inner_func_index < 3; + ++inner_func_index, ++src) { + const CRRResParams &src_res = res_params[src]; + + for (int i = 0; i < 4; ++i) { + if (inner_func_index == 0) { + dst_res.pgreek[i] = src_res.pgreek[i]; + } + } + + dst_res.vals[inner_func_index] = src_res.optval0; + } + } +} + +// Computes the Premium and Greeks +OutputRes ComputeOutput(const InputData &inp, const CRRInParams &in_params, + const InterRes &res_params) { + double h; + OutputRes res; + h = inp.spot * (in_params.u2[0] - 1 / in_params.u2[0]); + res.value = res_params.pgreek[1]; + res.delta = (res_params.pgreek[2] - res_params.pgreek[0]) / h; + res.gamma = 2 / h * + ((res_params.pgreek[2] - res_params.pgreek[1]) / inp.spot / + (in_params.u2[0] - 1) - + (res_params.pgreek[1] - res_params.pgreek[0]) / inp.spot / + (1 - (1 / in_params.u2[0]))); + res.theta = + (res_params.vals[0] - res_params.pgreek[3]) / 4 / inp.t * inp.n_steps; + res.rho = (res_params.vals[1] - res.value) / kEpsilon; + res.vega = (res_params.vals[2] - res.value) / kEpsilon; + return res; +} + +// Perform CRR solving using the CPU and compare FPGA resutls with CPU results +// to test correctness. +void TestCorrectness(int k, int n_crrs, bool &pass, const InputData &inp, + CRRInParams &vals, const OutputRes &fpga_res) { + if (k == 0) { + std::cout << "\n============= Correctness Test ============= \n"; + std::cout << "Running analytical correctness checks... 
\n"; + } + + // This CRR benchmark ensures a minimum 4 decimal points match between FPGA and CPU + // "threshold" is chosen to enforce this guarantee + float threshold = 0.00001; + int i, j, q; + double x; + int n_steps = vals.n_steps; + int m = n_steps + kOpt0; + vector pvalue(kMaxNSteps3); + vector pvalue_1(kMaxNSteps1); + vector pvalue_2(kMaxNSteps1); + vector pgreek(5); + InterRes cpu_res_params; + OutputRes cpu_res; + + // option value computed at each final node + x = vals.umin[0]; + for (i = 0; i <= m; i++, x *= vals.u2[0]) { + pvalue[i] = fmax(inp.cp * (x - inp.strike), 0.0); + } + + // backward recursion to evaluate option price + for (i = m - 1; i >= 0; i--) { + vals.umin[0] *= vals.u[0]; + x = vals.umin[0]; + for (j = 0; j <= i; j++, x *= vals.u2[0]) { + pvalue[j] = fmax(vals.c1[0] * pvalue[j] + vals.c2[0] * pvalue[j + 1], + inp.cp * (x - inp.strike)); + } + if (i == 4) { + pgreek[4] = pvalue[2]; + } + if (i == 2) { + for (q = 0; q <= 2; q++) { + pgreek[q + 1] = pvalue[q]; + } + } + } + cpu_res_params.vals[0] = pvalue[0]; + + // the above computation is repeated for each option price + x = vals.umin[1]; + for (i = 0; i <= n_steps; i++, x *= vals.u2[1]) { + pvalue_1[i] = fmax(inp.cp * (x - inp.strike), 0.0); + } + + for (i = n_steps - 1; i >= 0; i--) { + vals.umin[1] *= vals.u[1]; + x = vals.umin[1]; + + for (j = 0; j <= i; j++, x *= vals.u2[1]) { + pvalue_1[j] = + fmax(vals.c1[1] * pvalue_1[j] + vals.c2[1] * pvalue_1[j + 1], + inp.cp * (x - inp.strike)); + } + } + cpu_res_params.vals[1] = pvalue_1[0]; + + x = vals.umin[2]; + for (i = 0; i <= n_steps; i++, x *= vals.u2[2]) { + pvalue_2[i] = fmax(inp.cp * (x - inp.strike), 0.0); + } + + for (i = n_steps - 1; i >= 0; i--) { + vals.umin[2] *= vals.u[2]; + x = vals.umin[2]; + for (j = 0; j <= i; j++, x *= vals.u2[2]) { + pvalue_2[j] = + fmax(vals.c1[2] * pvalue_2[j] + vals.c2[2] * pvalue_2[j + 1], + inp.cp * (x - inp.strike)); + } + } + cpu_res_params.vals[2] = pvalue_2[0]; + pgreek[0] = 0; + + for (i = 1; i 
< 5; ++i) { + cpu_res_params.pgreek[i - 1] = pgreek[i]; + } + + cpu_res = ComputeOutput(inp, vals, cpu_res_params); + + if (abs(cpu_res.value - fpga_res.value) > threshold) { + pass = false; + std::cout << "fpga_res.value " << k << " = " << std::fixed + << std::setprecision(20) << fpga_res.value << "\n"; + std::cout << "cpu_res.value " << k << " = " << std::fixed + << std::setprecision(20) << cpu_res.value << "\n"; + std::cout << "Mismatch detected for value of crr " << k << "\n"; + } + if (abs(cpu_res.delta - fpga_res.delta) > threshold) { + pass = false; + std::cout << "fpga_res.delta " << k << " = " << std::fixed + << std::setprecision(20) << fpga_res.delta << "\n"; + std::cout << "cpu_res.delta " << k << " = " << std::fixed + << std::setprecision(20) << cpu_res.delta << "\n"; + std::cout << "Mismatch detected for value of crr " << k << "\n"; + } + if (abs(cpu_res.gamma - fpga_res.gamma) > threshold) { + pass = false; + std::cout << "fpga_res.gamma " << k << " = " << std::fixed + << std::setprecision(20) << fpga_res.gamma << "\n"; + std::cout << "cpu_res.gamma " << k << " = " << std::fixed + << std::setprecision(20) << cpu_res.gamma << "\n"; + std::cout << "Mismatch detected for value of crr " << k << "\n"; + } + if (abs(cpu_res.vega - fpga_res.vega) > threshold) { + pass = false; + std::cout << "fpga_res.vega " << k << " = " << std::fixed + << std::setprecision(20) << fpga_res.vega << "\n"; + std::cout << "cpu_res.vega " << k << " = " << std::fixed + << std::setprecision(20) << cpu_res.vega << "\n"; + std::cout << "Mismatch detected for value of crr " << k << "\n"; + } + if (abs(cpu_res.theta - fpga_res.theta) > threshold) { + pass = false; + std::cout << "fpga_res.theta " << k << " = " << std::fixed + << std::setprecision(20) << fpga_res.theta << "\n"; + std::cout << "cpu_res.theta " << k << " = " << std::fixed + << std::setprecision(20) << cpu_res.theta << "\n"; + std::cout << "Mismatch detected for value of crr " << k << "\n"; + } + if (abs(cpu_res.rho - 
fpga_res.rho) > threshold) { + pass = false; + std::cout << "fpga_res.rho " << k << " = " << std::fixed + << std::setprecision(20) << fpga_res.rho << "\n"; + std::cout << "cpu_res.rho " << k << " = " << std::fixed + << std::setprecision(20) << cpu_res.rho << "\n"; + std::cout << "Mismatch detected for value of crr " << k << "\n"; + } + + if (k == n_crrs - 1) { + std::cout << "CPU-FPGA Equivalence: " << (pass ? "PASS" : "FAIL") << "\n"; + } +} + +// Print out the achieved CRR throughput +void TestThroughput(const double &time, const int &n_crrs) { + std::cout << "\n============= Throughput Test =============\n"; + + std::cout << " Avg throughput: " << std::fixed << std::setprecision(1) + << (n_crrs / time) << " assets/s\n"; +} + +int main(int argc, char *argv[]) { + string infilename = ""; + string outfilename = ""; + + const string default_ifile = "src/data/ordered_inputs.csv"; + const string default_ofile = "src/data/ordered_outputs.csv"; + + char str_buffer[kMaxStringLen] = {0}; + for (int i = 1; i < argc; i++) { + if (argv[i][0] == '-') { + string sarg(argv[i]); + + FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen); + FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen); + } else { + infilename = string(argv[i]); + } + } + + try { +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + queue q(device_selector, dpc_common::exception_handler); + + std::cout << "Running on device: " + << q.get_device().get_info().c_str() << "\n"; + + device device = q.get_device(); + std::cout << "Device name: " + << device.get_info().c_str() << "\n \n \n"; + + vector inp; + + // Get input file name, if users don't have their test input file, this + // design will use the default input file + if (infilename == "") { + infilename = default_ifile; + } + ifstream inputFile(infilename); + + if (!inputFile.is_open()) { + std::cerr << "Input file doesn't exist \n"; + return 1; + } + + // 
Check input file format + string filename = infilename; + std::size_t found = filename.find_last_of("."); + if (!(filename.substr(found + 1).compare("csv") == 0)) { + std::cerr << "Input file format only support .csv\n"; + return 1; + } + + // Get output file name, if users don't define output file name, the design + // will use the default output file + outfilename = default_ofile; + if (strlen(str_buffer)) { + outfilename = string(str_buffer); + } + + // Check output file format + filename = outfilename; + found = filename.find_last_of("."); + if (!(filename.substr(found + 1).compare("csv") == 0)) { + std::cerr << "Output file format only support .csv\n"; + return 1; + } + + // Read inputs data from input file + ReadInputFromFile(inputFile, inp); + +// Get the number of data from the input file +// Emulator mode only goes through one input (or through OUTER_UNROLL inputs) to +// ensure fast runtime +#if defined(FPGA_EMULATOR) + int temp_crrs = 1; +#else + int temp_crrs = inp.size(); +#endif + + // Check if n_crrs >= OUTER_UNROLL + if (OUTER_UNROLL >= temp_crrs) { + if (inp.size() < OUTER_UNROLL) { + std::cerr << "Input size must be greater than or equal to OUTER_UNROLL\n"; + return 1; + } else { + temp_crrs = OUTER_UNROLL; + } + } + + const int n_crrs = temp_crrs; + + vector in_params(n_crrs); + vector array_params(n_crrs); + + for (int j = 0; j < n_crrs; ++j) { + in_params[j] = PrepareData(inp[j]); + array_params[j] = PrepareArrData(in_params[j]); + } + + // following vectors are arguments for CrrSolver + vector in_buff_params(n_crrs * 3); + vector in_buff2_params(n_crrs * 3); + + vector res_params(n_crrs * 3); + vector res_params_dummy(n_crrs * 3); + + // Prepare metadata as input to kernel + PrepareKernelData(in_params, array_params, in_buff_params, in_buff2_params, + n_crrs); + + // warmup run - use this run to warmup accelerator + CrrSolver(n_crrs, in_buff_params, res_params_dummy, in_buff2_params, + q); + // Timed run - profile performance + double time = 
CrrSolver(n_crrs, in_buff_params, res_params, + in_buff2_params, q); + bool pass = true; + + // Postprocessing step + // process_res used to compute final results + vector process_res(n_crrs); + ProcessKernelResult(res_params, process_res, n_crrs); + + vector result(n_crrs); + for (int i = 0; i < n_crrs; ++i) { + result[i] = ComputeOutput(inp[i], in_params[i], process_res[i]); + TestCorrectness(i, n_crrs, pass, inp[i], in_params[i], result[i]); + } + + // Write outputs data to output file + ofstream outputFile(outfilename); + + WriteOutputToFile(outputFile, result); + + TestThroughput(time, n_crrs); + + } catch (sycl::exception const &e) { + std::cout << "Caught a synchronous SYCL exception: " << e.what() << "\n"; + std::cout << " If you are targeting an FPGA hardware, " + "ensure that your system is plugged to an FPGA board that is " + "set up correctly\n"; + std::cout << " If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR\n"; + return 1; + } + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt new file mode 100755 index 0000000000..9ac77b0aff --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(GZip) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md new file mode 100755 index 0000000000..18117a82a5 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md @@ -0,0 +1,201 @@ +# GZIP Compression +Reference design demonstrating high-performance GZIP compression on FPGA. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. Additional reference material specific to this GZIP implementation is provided in the References section of this README. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How to implement a high performance multi-engine compression algorithm on FPGA +| Time to complete | 1 hr (not including compile time) + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +**Performance** +Please refer to the performance disclaimer at the end of this README. + +| Device | Throughput +|:--- |:--- +| Intel® PAC with Intel Arria® 10 GX FPGA | 1 engine @ 3.4 GB/s +| Intel® PAC with Intel Stratix® 10 SX FPGA | 2 engines @ 5.5 GB/s each = 11.0 GB/s total + + +## Purpose + +This DPC++ reference design implements a compression algorithm. The implementation is optimized for the FPGA device. The compression result is GZIP-compatible and can be decompressed with GUNZIP. The GZIP output file format is compatible with GZIP's DEFLATE algorithm, and follows a fixed subset of [RFC 1951](https://www.ietf.org/rfc/rfc1951.txt). See the References section for more specific references. + +The algorithm uses a GZIP-compatible Lempel-Ziv 77 (LZ77) algorithm for data de-duplication, and a GZIP-compatible Static Huffman algorithm for bit reduction. The implementation includes three FPGA accelerated tasks (LZ77, Static Huffman and CRC). + +The FPGA implementation of the algorithm enables either one or two independent GZIP compute engines to operate in parallel on the FPGA. The number of engines is constrained by the available FPGA resources. By default, the design is parameterized to create a single engine when the design is compiled targeting Intel® PAC with Intel Arria® 10 GX FPGA. Two engines are created when targeting Intel® PAC with Intel Stratix® 10 SX FPGA, a larger device. + +## Key Implementation Details + + | Kernel | Description +--- |--- +| LZ Reduction | Implements an LZ77 algorithm for data de-duplication. The algorithm produces distance and length information that is compatible with GZIP's DEFLATE implementation. 
+| Static Huffman | Uses the same Static Huffman codes used by GZIP's DEFLATE algorithm when it chooses a Static Huffman coding scheme for bit reduction. This choice maintains compatibility with GUNZIP. +| CRC | Adds a CRC checksum based on the input file; this is required by the gzip file format + +To optimize performance, GZIP leverages techniques discussed in the following FPGA tutorials: +* **Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing** (double_buffering) +* **On-Chip Memory Attributes** (mem_config) + + +## License +This code sample is licensed under MIT license. + + +## Building the `gzip` Reference Design + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + + +## Running the Reference Design + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./gzip.fpga_emu [-o=] (Linux) + gzip.fpga_emu.exe [-o=] (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./gzip.fpga [-o=] (Linux) + ``` + ### Application Parameters + +| Argument | Description +--- |--- +| `` | Mandatory argument that specifies the file to be compressed. Use a 120+ MB file to achieve peak performance. 
+| `-o=` | Optional argument that specifies the name of the output file. The default name of the output file is `.gz`. When targeting Intel Stratix® 10 SX, the single `` is fed to both engines, yielding two identical output files, using `` as the basis for the filenames. + +### Example of Output + +``` +Running on device: pac_a10 : Intel PAC Platform (pac_ee00000) +Throughput: 3.4321 GB/s +Compression Ratio 33.2737% +PASSED +``` +## Additional Design Information +### Source Code Explanation + +| File | Description +--- |--- +| `gzip.cpp` | Contains the `main()` function and the top-level interfaces to the SYCL* GZIP functions. +| `gzipkernel.cpp` | Contains the SYCL* kernels used to implement GZIP. +| `CompareGzip.cpp` | Contains code to compare a GZIP-compatible file with the original input. +| `WriteGzip.cpp` | Contains code to write a GZIP compatible file. +| `crc32.cpp` | Contains code to calculate a 32-bit CRC that is compatible with the GZIP file format and to combine multiple 32-bit CRC values. It is used to account only for the CRC of the last few bytes in the file, which are not processed by the accelerated CRC kernel. +| `kernels.hpp` | Contains miscellaneous defines and structure definitions required by the LZReduction and Static Huffman kernels. +| `crc32.hpp` | Header file for `crc32.cpp`. +| `gzipkernel.hpp` | Header file for `gzipkernels.cpp`. +| `CompareGzip.hpp` | Header file for `CompareGzip.cpp`. +| `WriteGzip.hpp` | Header file for `WriteGzip.cpp`. 
+ +### Compiler Flags Used + +| Flag | Description +--- |--- +`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator) +`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus +`-Xsseed=1` | Uses seed 1 during Quartus, yields slightly higher fmax +`-Xsnum-reorder=6` | On Intel Stratix® 10 SX only, specify a wider data path for read data from global memory +`-DNUM_ENGINES=<1|2>` | Specifies that 1 GZIP engine should be compiled when targeting Arria® 10 GX and 2 engines when targeting Intel Stratix® 10 SX + + +### Performance disclaimers + +Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks). + +Performance results are based on testing as of July 29, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure. + +Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com). + +The performance was measured by Intel on July 29, 2020 + +Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries. + +(C) Intel Corporation. 
+ +### References +[Khronos SYCL Resources](https://www.khronos.org/sycl/resources) + +[Intel GZIP OpenCL Design Example](https://www.intel.com/content/www/us/en/programmable/support/support-resources/design-examples/design-software/opencl/gzip-compression.html) + +[RFC 1951 - DEFLATE Data Format](https://www.ietf.org/rfc/rfc1951.txt) + +[RFC 1952 - GZIP Specification 4.3](https://www.ietf.org/rfc/rfc1952.txt) + +[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer) + diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt new file mode 100755 index 0000000000..a75dd96a90 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt @@ -0,0 +1,25 @@ +zlib License + + zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.11, January 15th, 2017 + + Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+ + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln new file mode 100755 index 0000000000..580f35f08b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gzip", "gzip.vcxproj", "{CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.ActiveCfg = Debug|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.Build.0 = Debug|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.ActiveCfg = Release|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {92BEFAAB-0365-4E5A-9C4A-E50AB49B2A6B} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj new file mode 100755 index 0000000000..cf6a2462d2 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj @@ -0,0 +1,174 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + + + + + + + + 15.0 + {cf6a576b-665d-4f24-bb62-0dae7a7b3c64} + Win32Proj + gzip + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI 
DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + + + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + + + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user new file mode 100755 index 0000000000..1956841792 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user @@ -0,0 +1,14 @@ + + + + false + + + src/gzip.cpp -o=test.gz + WindowsLocalDebugger + + + src/gzip.cpp -o=test.gz + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json new file mode 100755 index 0000000000..a6d65ecd17 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "D55081EB-669D-4832-BCE6-23EE2ACA9F0F", + "name": "GZIP Compression", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"], + "description": "Reference design demonstrating high-performance GZIP compression on FPGA", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], 
+ "builder": ["ide", "cmake"], + "targetDevice": ["FPGA"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./gzip.fpga_emu ../src/gzip.cpp -o=test.gz" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "gzip.fpga_emu.exe ../src/gzip.cpp -o=test.gz" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt new file mode 100755 index 0000000000..bf6125045f --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt @@ -0,0 +1,125 @@ +set(DEVICE_SOURCE_FILE gzipkernel.cpp) +set(DEVICE_HEADER_FILE gzipkernel.hpp) +set(HOST_SOURCE_FILE gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp) + +set(TARGET_NAME gzip) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) +set(REPORTS_TARGET ${TARGET_NAME}_report) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Design specific constant values + +# To increase NUM_ENGINES to greater than 2, must also statically declare more engines in gzipkernel.cpp --> SubmitGzipTasks() +set(NUM_ENGINES_A10 1) +set(NUM_ENGINES_S10 2) +set(NUM_REORDER "") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) +SET(NUM_ENGINES ${NUM_ENGINES_A10}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. 
Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + SET(NUM_ENGINES ${NUM_ENGINES_S10}) + set(NUM_REORDER "-Xsnum-reorder=6") + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +#specify -MMD -fsycl-link-targets=... 
instead of -fintelfpga to workaround known issue; lower report quality +set(HARDWARE_COMPILE_FLAGS -MMD -fsycl-link-targets=spir64_fpga-unknown-unknown-sycldevice -c -DNUM_ENGINES=${NUM_ENGINES}) + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +separate_arguments(USER_HARDWARE_FLAGS) +set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsparallel=2 -Xsseed=1 ${NUM_REORDER} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DNUM_ENGINES=${NUM_ENGINES}) +set(FINAL_LINK_FLAGS -fintelfpga -DNUM_ENGINES=${NUM_ENGINES}) + +set(EMULATOR_COMPILE_FLAGS "-v -v -v -g0 -fintelfpga -DFPGA_EMULATOR -DNUM_ENGINES=${NUM_ENGINES}") +set(EMULATOR_LINK_FLAGS -fintelfpga) + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set(DEVICE_FPGA_OBJ "gzipkernel_fpga.o") + set(DEVICE_IMAGE_FPGA_OBJ "gzipkernel_fpga.a") + set(HOST_SOURCE_FILES_WITH_PATH 
${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp) + + add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_FPGA_OBJ} + DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE}) + + set(OBJ_FILES) + foreach(HOST_FILE ${HOST_SOURCE_FILES_WITH_PATH}) + set(HOST_FPGA_OBJ ${HOST_FILE}.o) + add_custom_command(OUTPUT ${HOST_FPGA_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${HOST_FILE} -o ${HOST_FPGA_OBJ} + DEPENDS ${HOST_FILE}) + list(APPEND OBJ_FILES ${HOST_FPGA_OBJ}) + endforeach() + + add_custom_command(OUTPUT ${DEVICE_IMAGE_FPGA_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link=image ${DEVICE_FPGA_OBJ} -o ${DEVICE_IMAGE_FPGA_OBJ} + DEPENDS ${DEVICE_FPGA_OBJ} ${OBJ_FILES}) + + add_custom_command(OUTPUT ${FPGA_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${FINAL_LINK_FLAGS} ${OBJ_FILES} ${DEVICE_IMAGE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} + DEPENDS ${DEVICE_IMAGE_FPGA_OBJ} ${OBJ_FILES}) +endif() + +# fpga report +if(WIN32) + add_custom_target(report DEPENDS ${REPORTS_TARGET} ) + + separate_arguments(WIN_FLAGS WINDOWS_COMMAND) + add_custom_command(OUTPUT ${REPORTS_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET} + DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE}) + +else() + add_custom_target(report DEPENDS ${REPORTS_TARGET} ) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${DEVICE_SOURCE_FILE} COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/kernels.hpp kernels.hpp COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_HEADER_FILE} ${DEVICE_HEADER_FILE} COPYONLY) 
+ + add_custom_command(OUTPUT ${REPORTS_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET} + DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE} kernels.hpp) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu Makefile -o=test.gz + DEPENDS ${TARGET_NAME}.fpga_emu) + diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp new file mode 100755 index 0000000000..b803dee96b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp @@ -0,0 +1,85 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#include "CompareGzip.hpp" + +// returns 0 on success, otherwise failure +int CompareGzipFiles( + const std::string + &original_file, // original input file to compare gzip uncompressed + const std::string &input_gzfile) // gzip file to check +{ +#ifdef _MSC_VER + std::cout + << "Info: skipping output verification on Windows, no builtin gunzip\n"; + return 0; +#else + //------------------------------------------------------------------ + // assume all good to start with. + + int gzipstatus = 0; + + //------------------------------------------------------------------ + // Create temporary output filename for gunzip + + char tmp_name[] = "/tmp/gzip_fpga.XXXXXX"; + mkstemp(tmp_name); + std::string outputfile = tmp_name; + + //------------------------------------------------------------------ + // Check that the original file and gzipped file exist. + + //------------------------------------------------------------------ + // gunzip the file produced to stdout, capturing to the temp file. + + std::string cmd = "gunzip -c "; + cmd += input_gzfile; + cmd += " > " + outputfile; + + int gzout = ::system(cmd.c_str()); + if (gzout != 0) { + gzipstatus = 3; + } + + //------------------------------------------------------------------ + // diff the temp file and the original. + + cmd = "diff -q " + outputfile + " " + original_file; + int diffout = ::system(cmd.c_str()); + if (diffout != 0) { + gzipstatus = 4; + } + + //------------------------------------------------------------------ + // Cleanup, remove the temp file. 
+ + (void)::remove(outputfile.c_str()); + + return gzipstatus; +#endif +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp new file mode 100755 index 0000000000..5624b97cea --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp @@ -0,0 +1,41 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. 
+
+#ifndef __COMPAREGZIP_H__
+#define __COMPAREGZIP_H__
+#pragma once
+
+#include <iostream>
+#include <string>
+
+int CompareGzipFiles(
+    const std::string
+        &original_file,  // original input file to compare gzip uncompressed
+    const std::string &input_gzfile);  // gzip file to check
+
+#endif  //__COMPAREGZIP_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
new file mode 100755
index 0000000000..71c370aa96
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
@@ -0,0 +1,163 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#define _CRT_SECURE_NO_WARNINGS +#include "WriteGzip.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +constexpr int kDeflated = 8; +#define GZIP_MAGIC "\037\213" // Magic header for gzip files, 1F 8B + +#define ORIG_NAME 0x08 +#define OS_CODE 0x03 // Unix OS_CODE + +typedef struct GzipHeader { + unsigned char magic[2]; // 0x1f, 0x8b + unsigned char compress_method; // 0-7 reserved, 8=deflate -- kDeflated + unsigned char flags; // b0: file probably ascii + // b1: header crc-16 present + // b2: extra field present + // b3: original file name present + // b4: file comment present + // b5,6,7: reserved + unsigned long time; // file modification time in Unix format. + // Set this to 0 for now. + + unsigned char extra; // depends on compression method + unsigned char os; // operating system on which compression took place + + // ... + // ? bytes ... compressd data ... + + unsigned long crc; + unsigned long uncompressed_sz; + +} gzip_header, *pgzip_header; + +inline static void PutUlong(uint8_t *pc, unsigned long l) { + pc[0] = l & 0xff; + pc[1] = (l >> 8) & 0xff; + pc[2] = (l >> 16) & 0xff; + pc[3] = (l >> 24) & 0xff; +} + +// returns 0 on success, otherwise failure +int WriteBlockGzip( + std::string &original_filename, // Original file name being compressed + std::string &out_filename, // gzip filename + char *obuf, // pointer to compressed data block + size_t blen, // length of compressed data block + size_t ilen, // original block length + uint32_t buffer_crc) // the block's crc +{ + //------------------------------------------------------------------ + // Setup the gzip output file header. 
+ // max filename size is arbitrarily set to 256 bytes long + // Method is always DEFLATE + // Original filename is always set in header + // timestamp is set to 0 - ignored by gunzip + // deflate flags set to 0 + // OS code is 0 + + int max_filename_sz = 256; + + unsigned char *pgziphdr = + (unsigned char *)malloc(sizeof(gzip_header) + max_filename_sz); + + if (!pgziphdr) { + std::cout << "pgzip header cannot be allocated\n"; + return 1; + } + + pgziphdr[0] = GZIP_MAGIC[0]; + pgziphdr[1] = GZIP_MAGIC[1]; + pgziphdr[2] = kDeflated; + pgziphdr[3] = ORIG_NAME; + + // Set time in header to 0, this is ignored by gunzip. + pgziphdr[4] = 0; + pgziphdr[5] = 0; + pgziphdr[6] = 0; + pgziphdr[7] = 0; + + // Deflate flags + pgziphdr[8] = 0; + + // OS code is Linux in this case. + pgziphdr[9] = OS_CODE; + + int ondx = 10; + + const char *p = original_filename.c_str(); + do { + pgziphdr[ondx++] = (*p); + } while (*p++); + + int header_bytes = ondx; + + unsigned char prolog[8]; + + PutUlong(((unsigned char *)prolog), buffer_crc); + PutUlong(((unsigned char *)&prolog[4]), ilen); + + FILE *fo = fopen(out_filename.c_str(), "w+"); + if (ferror(fo)) { + std::cout << "Cannot open file for output: " << out_filename << "\n"; + free(pgziphdr); + return 1; + } + + fwrite(pgziphdr, 1, header_bytes, fo); + fwrite(obuf, 1, blen, fo); + fwrite(prolog, 1, 8, fo); + + if (ferror(fo)) { + std::cout << "gzip output file write failure.\n"; + free(pgziphdr); + return 1; + } + + if (fclose(fo)) { + perror("close"); + free(pgziphdr); + return 1; + } + free(pgziphdr); + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp new file mode 100755 index 0000000000..66bc28e315 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp @@ -0,0 +1,45 @@ +// ============================================================== +// Copyright Intel Corporation +// +// 
SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. 
+ +#ifndef __WRITEGZIP_H__ +#define __WRITEGZIP_H__ +#pragma once + +#include +#include + +// returns 0 on success, otherwise failure +int WriteBlockGzip( + std::string &original_filename, // Original file name being compressed + std::string &out_filename, // gzip filename + char *obuf, // pointer to compressed data block + size_t blen, // length of compressed data block + size_t ilen, // original block length + uint32_t buffer_crc); // the block's crc + +#endif //__WRITEGZIP_H__ diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja new file mode 100755 index 0000000000..29d50e63a0 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja @@ -0,0 +1,32 @@ +device_source_file = gzipkernel.cpp +device_header_file = gzipkernel.h +host_source_file = gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp +target_name = gzip + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware -Xsclock=280MHz -Xsparallel=2 -Xsseed=1 +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} ${device_source_file} ${host_source_file} -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${device_source_file} ${host_source_file} -fsycl-link -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${device_source_file} ${host_source_file} -fsycl-link -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu + +# report +build report: phony ${report_target} +build ${report_target}: gen_report + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac diff --git 
a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp new file mode 100755 index 0000000000..8e6c59c734 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp @@ -0,0 +1,126 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. 
+ +/* + * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "crc32.hpp" + +// This table is CRC32s for all single byte values created by using the +// makecrc.c utility from gzip for compatibility with gzip. makecrc.c can be +// found in the gzip source code project found at +// https://git.savannah.gnu.org/git/gzip.git. The polynomial 0xedb88320 is used +// for gzip, and thus used to create this table. +// +// Not copyrighted 1990, Mark Adler. +// +const unsigned int crc32_table[] = { + 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, + 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, + 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, + 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, + 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, + 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, + 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, + 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, + 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, + 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, + 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, + 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, + 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, + 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, + 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, + 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, + 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, + 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, + 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, + 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, + 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, 
+ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, + 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, + 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, + 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, + 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, + 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, + 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, + 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, + 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, + 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, + 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, + 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, + 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, + 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, + 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, + 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, + 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, + 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, + 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, + 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, + 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, + 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, + 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, + 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, + 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, + 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, + 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, + 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, + 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, + 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 
0x5a05df1bL,
+    0x2d02ef8dL};
+
+//
+// This routine creates a Crc32 from a memory buffer (address, and length), and
+// a previous crc. This routine can be called iteratively on different portions
+// of the same buffer, using a previously returned crc value. The
+// value 0xffffffff is used for the first buffer invocation.
+unsigned int Crc32Host(
+    const char *pbuf,           // pointer to the buffer to crc
+    size_t sz,                  // number of bytes
+    unsigned int previous_crc)  // previous CRC, allows combining.
+{
+  unsigned int curr_crc = ~previous_crc;
+  if (sz) do {
+      curr_crc =
+          crc32_table[((int)curr_crc ^ (*pbuf++)) & 0xff] ^ (curr_crc >> 8);
+    } while (--sz);
+  return curr_crc ^ 0xffffffffL;
+}
+
+unsigned int Crc32(const char *in, size_t buffer_sz,
+                   unsigned int previous_crc) {
+  const int num_nibbles_parallel = 64;
+  const int num_sections =
+      buffer_sz / (num_nibbles_parallel / 2);  // how many loop iterations
+  // now deal with the remainder, this should be done on the software host
+  // the post-invert also happens inside crc_reference
+  const char *remaining_data = &in[num_sections * (num_nibbles_parallel / 2)];
+  int remaining_bytes = buffer_sz % (num_nibbles_parallel / 2);
+  return Crc32Host(remaining_data, remaining_bytes, previous_crc);
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
new file mode 100755
index 0000000000..138a8f0754
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
@@ -0,0 +1,46 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#ifndef __CRC32_H__ +#define __CRC32_H__ +#pragma once + +#include +#include + +uint32_t Crc32Host( + const char *pbuf, // pointer to the buffer to crc + size_t sz, // number of bytes + uint32_t previous_crc); // previous CRC, allows combining. First invocation + // would use 0xffffffff. +uint32_t Crc32(const char *pbuf, // pointer to the buffer to crc + size_t sz, // number of bytes + uint32_t previous_crc); // previous CRC, allows combining. First + // invocation would use 0xffffffff. 
+ +#endif //__CRC32_H__ diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp new file mode 100755 index 0000000000..9ecfe11728 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp @@ -0,0 +1,520 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. 
+
+// NOTE(review): the five #include targets below were stripped by HTML
+// sanitization in the garbled patch; they are reconstructed from the names
+// used in this file (sycl queue/event, intel::fpga_*_selector, std::ifstream,
+// std::cout, strlen) — confirm against the upstream sample.
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <fstream>
+#include <iostream>
+#include <string.h>
+
+#include "CompareGzip.hpp"
+#include "WriteGzip.hpp"
+#include "crc32.hpp"
+#include "dpc_common.hpp"
+#include "gzipkernel.hpp"
+#include "kernels.hpp"
+
+using namespace sycl;
+
+// The minimum file size of a file to be compressed.
+// Any filesize less than this results in an error.
+constexpr int minimum_filesize = kVec + 1;
+
+bool help = false;
+
+int CompressFile(queue &q, std::string &input_file,
+                 std::vector<std::string> outfilenames, int iterations,
+                 bool report);
+
+// Print command-line usage for this sample.
+void Help(void) {
+  // Command line arguments.
+  // gzip [options] filetozip [options]
+  // -h,--help : help
+
+  // future options?
+  // -p,performance : output perf metrics
+  // -m,maxmapping=# : maximum mapping size
+
+  std::cout << "gzip filename [options]\n";
+  std::cout << "  -h,--help : this help text\n";
+  std::cout
+      << "  -o=<filename>,--output-file=<filename> : specify output file\n";
+}
+
+// If "arg" begins with prefix "str", parse the integer that follows it into
+// *val and return true; otherwise leave *val untouched and return false.
+bool FindGetArg(std::string &arg, const char *str, int defaultval, int *val) {
+  std::size_t found = arg.find(str, 0, strlen(str));
+  if (found != std::string::npos) {
+    int value = atoi(&arg.c_str()[strlen(str)]);
+    *val = value;
+    return true;
+  }
+  return false;
+}
+
+constexpr int kMaxStringLen = 40;
+
+// If "arg" begins with prefix "str", copy the remainder (up to whitespace or
+// maxchars-1 characters) into str_value and return true.
+bool FindGetArgString(std::string &arg, const char *str, char *str_value,
+                      size_t maxchars) {
+  std::size_t found = arg.find(str, 0, strlen(str));
+  if (found != std::string::npos) {
+    const char *sptr = &arg.c_str()[strlen(str)];
+    for (int i = 0; i < maxchars - 1; i++) {
+      char ch = sptr[i];
+      switch (ch) {
+        case ' ':
+        case '\t':
+        case '\0':
+          str_value[i] = 0;
+          return true;
+          break;
+        default:
+          str_value[i] = ch;
+          break;
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
+// Return the elapsed device time of a profiled SYCL event, in nanoseconds.
+// (The template arguments below were stripped by sanitization; restored to the
+// standard SYCL profiling queries.)
+size_t SyclGetExecTimeNs(event e) {
+  size_t start_time =
+      e.get_profiling_info<info::event_profiling::command_start>();
+  size_t end_time =
+      e.get_profiling_info<info::event_profiling::command_end>();
+  return (end_time - start_time);
+}
+
+int main(int argc, char *argv[]) {
+  std::string infilename = "";
+
+  std::vector<std::string> outfilenames(kNumEngines);
+
+  char str_buffer[kMaxStringLen] = {0};
+
+  // Check the number of arguments specified
+  if (argc != 3) {
+    std::cerr << "Incorrect number of arguments. Correct usage: " << argv[0]
+              << " <input_file> -o=<output_file>\n";
+    return 1;
+  }
+
+  for (int i = 1; i < argc; i++) {
+    if (argv[i][0] == '-') {
+      std::string sarg(argv[i]);
+      if (std::string(argv[i]) == "-h") {
+        help = true;
+      }
+      if (std::string(argv[i]) == "--help") {
+        help = true;
+      }
+
+      FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen);
+      FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen);
+    } else {
+      infilename = std::string(argv[i]);
+    }
+  }
+
+  if (help) {
+    Help();
+    return 1;
+  }
+
+  try {
+#ifdef FPGA_EMULATOR
+    intel::fpga_emulator_selector device_selector;
+#else
+    intel::fpga_selector device_selector;
+#endif
+    auto prop_list = property_list{property::queue::enable_profiling()};
+    queue q(device_selector, dpc_common::exception_handler, prop_list);
+
+    std::cout << "Running on device: "
+              << q.get_device().get_info<info::device::name>().c_str() << "\n";
+
+    if (infilename == "") {
+      std::cout << "Must specify a filename to compress\n\n";
+      Help();
+      return 1;
+    }
+
+    // next, check valid and acceptable parameter ranges.
+    // if output filename not set, use the default
+    // name, else use the name specified by the user
+    outfilenames[0] = std::string(infilename) + ".gz";
+    if (strlen(str_buffer)) {
+      outfilenames[0] = std::string(str_buffer);
+    }
+    for (size_t i = 1; i < kNumEngines; i++) {
+      // Filenames will be of the form outfilename, outfilename2, outfilename3 etc.
+      outfilenames[i] = outfilenames[0] + std::to_string(i + 1);
+    }
+
+    std::cout << "Launching GZIP application with " << kNumEngines
+              << " engines\n";
+
+#ifdef FPGA_EMULATOR
+    CompressFile(q, infilename, outfilenames, 1, true);
+#else
+    // warmup run - use this run to warmup accelerator. There are some steps in
+    // the runtime that are only executed on the first kernel invocation but not
+    // on subsequent invocations.
So execute all that stuff here before we + // measure performance (in the next call to CompressFile(). + CompressFile(q, infilename, outfilenames, 1, false); + // profile performance + CompressFile(q, infilename, outfilenames, 200, true); +#endif + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + return 0; +} + +struct KernelInfo { + buffer *gzip_out_buf; + buffer *current_crc; + buffer *pobuf; + buffer *pibuf; + char *pobuf_decompress; + + uint32_t buffer_crc[kMinBufferSize]; + uint32_t refcrc; + + const char *pref_buffer; + char *poutput_buffer; + size_t file_size; + struct GzipOutInfo out_info[kMinBufferSize]; + int iteration; + bool last_block; +}; + +// returns 0 on success, otherwise a non-zero failure code. +int CompressFile(queue &q, std::string &input_file, std::vector outfilenames, + int iterations, bool report) { + size_t isz; + char *pinbuf; + + // Read the input file + std::string device_string = + q.get_device().get_info().c_str(); + bool prepin = + (device_string.find("s10") != + std::string::npos); // Check if "s10" is found in the device string. If + // the device is S10, we pre-pin some buffers to + // improve DMA performance, which is needed to + // achieve peak kernel throughput. Pre-pinning is + // only supported on the PAC-S10 BSP. It's not + // needed on PAC-A10 to achieve peak performance. 
+ + std::ifstream file(input_file, + std::ios::in | std::ios::binary | std::ios::ate); + if (file.is_open()) { + isz = file.tellg(); + if (prepin) { + pinbuf = (char *)malloc_host( + isz, q.get_context()); // Pre-pin the buffer, for faster DMA + } else { // throughput, using malloc_host(). + pinbuf = new char[isz]; + } + file.seekg(0, std::ios::beg); + file.read(pinbuf, isz); + file.close(); + } else { + std::cout << "Error: cannot read specified input file\n"; + return 1; + } + + if (isz < minimum_filesize) { + std::cout << "Minimum filesize for compression is " << minimum_filesize + << "\n"; + return 1; + } + + int buffers_count = iterations; + + // Create an array of kernel info structures and create buffers for kernel + // input/output. The buffers are re-used between iterations, but enough + // disjoint buffers are created to support double-buffering. + struct KernelInfo *kinfo[kNumEngines]; + for (size_t eng = 0; eng < kNumEngines; eng++) { + kinfo[eng] = + (struct KernelInfo *)malloc(sizeof(struct KernelInfo) * buffers_count); + if (kinfo[eng] == NULL) { + std::cout << "Cannot allocate kernel info buffer.\n"; + return 1; + } + for (int i = 0; i < buffers_count; i++) { + kinfo[eng][i].file_size = isz; + // Allocating slightly larger buffers (+ 16 * kVec) to account for + // granularity of kernel writes + int outputSize = kinfo[eng][i].file_size + 16 * kVec < kMinBufferSize + ? kMinBufferSize + : kinfo[eng][i].file_size + 16 * kVec; + + // Pre-pin buffer using malloc_host() to improve DMA bandwidth. 
+      if (i >= 3) {
+        // Re-use the buffers of the iteration three steps back
+        // (double/triple-buffering rotation).
+        kinfo[eng][i].poutput_buffer = kinfo[eng][i - 3].poutput_buffer;
+      } else {
+        if (prepin) {
+          kinfo[eng][i].poutput_buffer =
+              (char *)malloc_host(outputSize, q.get_context());
+        } else {
+          kinfo[eng][i].poutput_buffer = (char *)malloc(outputSize);
+        }
+        if (kinfo[eng][i].poutput_buffer == NULL) {
+          std::cout << "Cannot allocate output buffer.\n";
+          // BUGFIX: kinfo is a stack array of pointers; free the heap block
+          // allocated for this engine (kinfo[eng]), not the array itself.
+          // (Blocks allocated for earlier engines are leaked on this error
+          // path; acceptable since the program exits immediately.)
+          free(kinfo[eng]);
+          return 1;
+        }
+        // zero pages to fully allocate them
+        memset(kinfo[eng][i].poutput_buffer, 0, outputSize);
+      }
+
+      kinfo[eng][i].last_block = true;
+      kinfo[eng][i].iteration = i;
+      kinfo[eng][i].pref_buffer = pinbuf;
+
+      // NOTE(review): the buffer template arguments below were stripped by
+      // sanitization; restored from the KernelInfo member types — confirm
+      // against the upstream sample.
+      kinfo[eng][i].gzip_out_buf =
+          i >= 3 ? kinfo[eng][i - 3].gzip_out_buf
+                 : new buffer<struct GzipOutInfo, 1>(kMinBufferSize);
+      kinfo[eng][i].current_crc =
+          i >= 3 ? kinfo[eng][i - 3].current_crc
+                 : new buffer<unsigned, 1>(kMinBufferSize);
+      kinfo[eng][i].pibuf =
+          i >= 3 ? kinfo[eng][i - 3].pibuf
+                 : new buffer<char, 1>(kinfo[eng][i].file_size);
+      kinfo[eng][i].pobuf =
+          i >= 3 ? kinfo[eng][i - 3].pobuf : new buffer<char, 1>(outputSize);
+      kinfo[eng][i].pobuf_decompress = (char *)malloc(kinfo[eng][i].file_size);
+    }
+  }
+
+  // Create events for the various parts of the execution so that we can profile
+  // their performance.
+  event e_input_dma [kNumEngines][buffers_count]; // Input to the GZIP engine. This is a transfer from host to device.
+  event e_output_dma [kNumEngines][buffers_count]; // Output from the GZIP engine. This is transfer from device to host.
+ event e_crc_dma [kNumEngines][buffers_count]; // Transfer CRC from device to host + event e_size_dma [kNumEngines][buffers_count]; // Transfer compressed file size from device to host + event e_k_crc [kNumEngines][buffers_count]; // CRC kernel + event e_k_lz [kNumEngines][buffers_count]; // LZ77 kernel + event e_k_huff [kNumEngines][buffers_count]; // Huffman Encoding kernel + +#ifndef FPGA_EMULATOR + dpc_common::TimeInterval perf_timer; +#endif + + + /*************************************************/ + /* Main loop where the actual execution happens */ + /*************************************************/ + for (int i = 0; i < buffers_count; i++) { + for (size_t eng = 0; eng < kNumEngines; eng++) { + // Transfer the input data, to be compressed, from host to device. + e_input_dma[eng][i] = q.submit([&](handler &h) { + auto in_data = + kinfo[eng][i].pibuf->get_access(h); + h.copy(kinfo[eng][i].pref_buffer, in_data); + }); + + /************************************/ + /************************************/ + /* LAUNCH GZIP ENGINE */ + /************************************/ + /************************************/ + SubmitGzipTasks(q, kinfo[eng][i].file_size, kinfo[eng][i].pibuf, + kinfo[eng][i].pobuf, kinfo[eng][i].gzip_out_buf, + kinfo[eng][i].current_crc, kinfo[eng][i].last_block, + e_k_crc[eng][i], e_k_lz[eng][i], e_k_huff[eng][i], eng); + + // Transfer the output (compressed) data from device to host. + e_output_dma[eng][i] = q.submit([&](handler &h) { + auto out_data = kinfo[eng][i].pobuf->get_access(h); + h.copy(out_data, kinfo[eng][i].poutput_buffer); + }); + + // Transfer the file size of the compressed output file from device to host. + e_size_dma[eng][i] = q.submit([&](handler &h) { + auto out_data = + kinfo[eng][i].gzip_out_buf->get_access(h); + h.copy(out_data, kinfo[eng][i].out_info); + }); + + // Transfer the CRC of the compressed output file from device to host. 
+ e_crc_dma[eng][i] = q.submit([&](handler &h) { + auto out_data = + kinfo[eng][i].current_crc->get_access(h); + h.copy(out_data, kinfo[eng][i].buffer_crc); + }); + } + } + + // Wait for all kernels to complete + for (int eng = 0; eng < kNumEngines; eng++) { + for (int i = 0; i < buffers_count; i++) { + e_output_dma[eng][i].wait(); + e_size_dma[eng][i].wait(); + e_crc_dma[eng][i].wait(); + } + } + +// Stop the timer. +#ifndef FPGA_EMULATOR + double diff_total = perf_timer.Elapsed(); + double gbps = iterations * isz / (double)diff_total / 1000000000.0; +#endif + + // Check the compressed file size from each iteration. Make sure the size is actually + // less-than-or-equal to the input size. Also calculate the remaining CRC. + size_t compressed_sz[kNumEngines]; + for (int eng = 0; eng < kNumEngines; eng++) { + compressed_sz[eng] = 0; + for (int i = 0; i < buffers_count; i++) { + if (kinfo[eng][i].out_info[0].compression_sz > kinfo[eng][i].file_size) { + std::cerr << "Unsupported: compressed file larger than input file( " + << kinfo[eng][i].out_info[0].compression_sz << " )\n"; + return 1; + } + // The majority of the CRC is calculated by the CRC kernel on the FPGA. But the kernel + // operates on quantized chunks of input data, so any remaining input data, that falls + // outside the quanta, is included in the overall CRC calculation via the following + // function that runs on the host. The last argument is the running CRC that was computed + // on the FPGA. + kinfo[eng][i].buffer_crc[0] = + Crc32(kinfo[eng][i].pref_buffer, kinfo[eng][i].file_size, + kinfo[eng][i].buffer_crc[0]); + // Accumulate the compressed size across all iterations. Used to + // compute compression ratio later. 
+ compressed_sz[eng] += kinfo[eng][i].out_info[0].compression_sz; + } + } + + // delete the file mapping now that all kernels are complete, and we've + // snapped the time delta + if (prepin) { + free(pinbuf, q.get_context()); + } else { + delete pinbuf; + } + + // Write the output compressed data from the first iteration of each engine, to a file. + for (int eng = 0; eng < kNumEngines; eng++) { + // WriteBlockGzip() returns 1 on failure + if (report && WriteBlockGzip(input_file, outfilenames[eng], kinfo[eng][0].poutput_buffer, + kinfo[eng][0].out_info[0].compression_sz, + kinfo[eng][0].file_size, kinfo[eng][0].buffer_crc[0])) { + std::cout << "FAILED\n"; + return 1; + } + } + + // Decompress the output from engine-0 and compare against the input file. Only engine-0's + // output is verified since all engines are fed the same input data. + if (report && CompareGzipFiles(input_file, outfilenames[0])) { + std::cout << "FAILED\n"; + return 1; + } + + // Generate throughput report + // First gather all the execution times. 
+ size_t time_k_crc[kNumEngines]; + size_t time_k_lz[kNumEngines]; + size_t time_k_huff[kNumEngines]; + size_t time_input_dma[kNumEngines]; + size_t time_output_dma[kNumEngines]; + for (int eng = 0; eng < kNumEngines; eng++) { + time_k_crc[eng] = 0; + time_k_lz[eng] = 0; + time_k_huff[eng] = 0; + time_input_dma[eng] = 0; + time_output_dma[eng] = 0; + for (int i = 0; i < buffers_count; i++) { + e_k_crc[eng][i].wait(); + e_k_lz[eng][i].wait(); + e_k_huff[eng][i].wait(); + time_k_crc[eng] += SyclGetExecTimeNs(e_k_crc[eng][i]); + time_k_lz[eng] += SyclGetExecTimeNs(e_k_lz[eng][i]); + time_k_huff[eng] += SyclGetExecTimeNs(e_k_huff[eng][i]); + time_input_dma[eng] += SyclGetExecTimeNs(e_input_dma[eng][i]); + time_output_dma[eng] += SyclGetExecTimeNs(e_output_dma[eng][i]); + } + } + + if (report) { + double compression_ratio = + (double)((double)compressed_sz[0] / (double)isz / iterations); +#ifndef FPGA_EMULATOR + std::cout << "Throughput: " << kNumEngines * gbps << " GB/s\n\n"; + for (int eng = 0; eng < kNumEngines; eng++) { + std::cout << "TP breakdown for engine #" << eng << " (GB/s)\n"; + std::cout << "CRC = " << iterations * isz / (double)time_k_crc[eng] + << "\n"; + std::cout << "LZ77 = " << iterations * isz / (double)time_k_lz[eng] + << "\n"; + std::cout << "Huffman Encoding = " + << iterations * isz / (double)time_k_huff[eng] << "\n"; + std::cout << "DMA host-to-device = " + << iterations * isz / (double)time_input_dma[eng] << "\n"; + std::cout << "DMA device-to-host = " + << iterations * isz / (double)time_output_dma[eng] << "\n\n"; + } +#endif + std::cout << "Compression Ratio " << compression_ratio * 100 << "%\n"; + } + + // Cleanup anything that was allocated by this routine. 
+ for (int eng = 0; eng < kNumEngines; eng++) { + for (int i = 0; i < buffers_count; i++) { + if (i < 3) { + delete kinfo[eng][i].gzip_out_buf; + delete kinfo[eng][i].current_crc; + delete kinfo[eng][i].pibuf; + delete kinfo[eng][i].pobuf; + if (prepin) { + free(kinfo[eng][i].poutput_buffer, q.get_context()); + } else { + free(kinfo[eng][i].poutput_buffer); + } + } + free(kinfo[eng][i].pobuf_decompress); + } + free(kinfo[eng]); + } + + if (report) std::cout << "PASSED\n"; + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp new file mode 100755 index 0000000000..01d69c1f9b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp @@ -0,0 +1,2406 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +/* + * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include + +#include "gzipkernel.hpp" +#include "kernels.hpp" + +using namespace sycl; + +// This reference design uses a template-based unroller. It's also possible +// to specify this in a more concise way using a pragma. See the loop unroll +// tutorial for more information. +template +struct Unroller { + template + static void step(const Action &action) { + action(Begin); + Unroller::step(action); + } +}; + +template +struct Unroller { + template + static void step(const Action &action) {} +}; + +int GetHuffLiteralBits(unsigned char ch) { + CtData static_ltree[kLCodes + 2] = { + {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8}, + {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8}, + {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8}, + {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8}, + {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8}, + {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8}, + {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8}, + {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8}, + {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8}, + {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8}, + {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8}, + {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8}, + {33, 
8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8}, + {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8}, + {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8}, + {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8}, + {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8}, + {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8}, + {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8}, + {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8}, + {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9}, + {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9}, + {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9}, + {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9}, + {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9}, + {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9}, + {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9}, + {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9}, + {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9}, + {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9}, + {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9}, + {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9}, + {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9}, + {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9}, + {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9}, + {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9}, + {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7}, + {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7}, + {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7}, + {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7}, + {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, 
{99, 8}, + {227, 8}, + }; + return static_ltree[ch].code; +} + +int GetHuffLiteralLen(unsigned char ch) { + CtData static_ltree[kLCodes + 2] = { + {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8}, + {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8}, + {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8}, + {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8}, + {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8}, + {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8}, + {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8}, + {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8}, + {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8}, + {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8}, + {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8}, + {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8}, + {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8}, + {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8}, + {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8}, + {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8}, + {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8}, + {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8}, + {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8}, + {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8}, + {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9}, + {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9}, + {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9}, + {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9}, + {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9}, + {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9}, + {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, 
{443, 9}, {123, 9}, + {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9}, + {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9}, + {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9}, + {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9}, + {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9}, + {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9}, + {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9}, + {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9}, + {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9}, + {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7}, + {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7}, + {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7}, + {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7}, + {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8}, + {227, 8}, + }; + return static_ltree[ch].len; +} + +int GetHuffRunLen(int len, int initial_dist) { + int lc; + unsigned code; + int extra; + int dist; + int local_lbits, local_llen; + int local_dbits, local_dlen; + local_lbits = 0; + local_llen = 0; + + int base_length[kLengthCodes] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, + 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0, + }; + + int extra_lbits[kLengthCodes] // extra bits for each length code + = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}; + + CtData static_ltree[kLCodes + 2] = { + {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8}, + {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8}, + {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8}, + {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8}, + {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8}, + {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, 
{26, 8}, {154, 8}, + {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8}, + {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8}, + {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8}, + {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8}, + {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8}, + {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8}, + {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8}, + {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8}, + {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8}, + {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8}, + {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8}, + {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8}, + {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8}, + {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8}, + {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9}, + {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9}, + {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9}, + {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9}, + {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9}, + {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9}, + {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9}, + {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9}, + {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9}, + {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9}, + {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9}, + {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9}, + {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9}, + {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9}, + {239, 9}, {495, 9}, {31, 
9}, {287, 9}, {159, 9}, {415, 9}, {95, 9}, + {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9}, + {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7}, + {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7}, + {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7}, + {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7}, + {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8}, + {227, 8}, + }; + + // distance codes. The first 256 values correspond to the distances + // 3 .. 258, the last 256 values correspond to the top 8 bits of + // the 15 bit distances. + unsigned char dist_code[512] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, + 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, + }; + // length code for each normalized match length (0 == kMinMatch) + unsigned char length_code[kMaxMatch - kMinMatch + 1] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, + 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, + 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 28, + }; + + int extra_dbits[kDCodes] // extra bits for each distance code + = {0, 0, 0, 
0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; + + int base_dist[kDCodes] = { + 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, + 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, + 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, + }; + + CtData static_dtree[kDCodes] = { + {0, 5}, {16, 5}, {8, 5}, {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5}, + {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5}, + {1, 5}, {17, 5}, {9, 5}, {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5}, + {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5}, + }; + + lc = len - kMinMatch; + code = length_code[lc]; + + local_lbits = static_ltree[code + kLiterals + 1].code; + local_llen = static_ltree[code + kLiterals + 1].len; + extra = extra_lbits[code]; + if (extra) { + lc -= base_length[code]; + local_lbits |= lc << local_llen; + local_llen += extra; + } + + dist = initial_dist; + dist--; + code = d_code(dist); + local_dbits = static_dtree[code].code; + local_dlen = static_dtree[code].len; + extra = extra_dbits[code]; + if (extra) { + dist -= base_dist[code]; + local_dbits |= dist << local_dlen; + local_dlen += extra; + } + + local_lbits |= local_dbits << local_llen; + local_llen += local_dlen; + + return local_llen; +} + +int GetHuffRunBits(int len, int initial_dist) { + int lc; + unsigned code; + int extra; + int dist; + int local_lbits, local_llen; + int local_dbits, local_dlen; + local_lbits = 0; + local_llen = 0; + + int base_length[kLengthCodes] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, + 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0, + }; + + int extra_lbits[kLengthCodes] // extra bits for each length code + = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0}; + + CtData static_ltree[kLCodes + 2] = { + {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8}, + {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8}, + {124, 8}, {252, 8}, {2, 8}, {130, 
8}, {66, 8}, {194, 8}, {34, 8}, + {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8}, + {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8}, + {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8}, + {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8}, + {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8}, + {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8}, + {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8}, + {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8}, + {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8}, + {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8}, + {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8}, + {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8}, + {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8}, + {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8}, + {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8}, + {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8}, + {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8}, + {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9}, + {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9}, + {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9}, + {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9}, + {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9}, + {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9}, + {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9}, + {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9}, + {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9}, + {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9}, + {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9}, + {311, 9}, {183, 
9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9}, + {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9}, + {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9}, + {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9}, + {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9}, + {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7}, + {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7}, + {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7}, + {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7}, + {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8}, + {227, 8}, + }; + + // distance codes. The first 256 values correspond to the distances + // 3 .. 258, the last 256 values correspond to the top 8 bits of + // the 15 bit distances. + unsigned char dist_code[512] = { + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, + 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 
22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 29, + }; + // length code for each normalized match length (0 == kMinMatch) + unsigned char length_code[kMaxMatch - kMinMatch + 1] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, + 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, + 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 
26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 28, + }; + + int extra_dbits[kDCodes] // extra bits for each distance code + = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13}; + + int base_dist[kDCodes] = { + 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, + 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, + 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, + }; + + CtData static_dtree[kDCodes] = { + {0, 5}, {16, 5}, {8, 5}, {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5}, + {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5}, + {1, 5}, {17, 5}, {9, 5}, {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5}, + {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5}, + }; + + lc = len - kMinMatch; + code = length_code[lc]; + + local_lbits = static_ltree[code + kLiterals + 1].code; + local_llen = static_ltree[code + kLiterals + 1].len; + extra = extra_lbits[code]; + if (extra) { + lc -= base_length[code]; + local_lbits |= lc << local_llen; + local_llen += extra; + } + + dist = initial_dist; + dist--; + code = d_code(dist); + local_dbits = static_dtree[code].code; + local_dlen = static_dtree[code].len; + extra = extra_dbits[code]; + if (extra) { + dist -= base_dist[code]; + local_dbits |= dist << local_dlen; + local_dlen += extra; + } + + local_lbits |= local_dbits << local_llen; + local_llen += local_dlen; + + return local_lbits; +} + +int GetHuffLen(int len, int dist, unsigned char ch) { + int returned_len; + + CtData static_ltree[kLCodes + 2] = { + {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8}, + {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8}, + {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8}, + {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8}, + {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8}, + {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 
8}, {154, 8}, + {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8}, + {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8}, + {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8}, + {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8}, + {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8}, + {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8}, + {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8}, + {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8}, + {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8}, + {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8}, + {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8}, + {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8}, + {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8}, + {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8}, + {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9}, + {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9}, + {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9}, + {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9}, + {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9}, + {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9}, + {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9}, + {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9}, + {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9}, + {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9}, + {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9}, + {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9}, + {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9}, + {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9}, + {239, 9}, {495, 9}, {31, 9}, 
{287, 9}, {159, 9}, {415, 9}, {95, 9}, + {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9}, + {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7}, + {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7}, + {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7}, + {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7}, + {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8}, + {227, 8}, + }; + switch (len) { + case -3: + returned_len = static_ltree[kEndBlock].len; + break; + case -2: + returned_len = 3; + break; + case -1: + returned_len = 0; + break; + case 0: + returned_len = GetHuffLiteralLen(ch); + break; + default: + returned_len = GetHuffRunLen(len, dist); + break; + } + return returned_len; +} + +int IsValid(int len, int dist, unsigned char ch) { + switch (len) { + case -3: + return 1; + case -2: + return 1; + case -1: + return 0; + case 0: + return 1; + default: + return 1; + } +} + +int GetHuffBits(int len, int dist, unsigned char ch) { + int bits; + CtData static_ltree[kLCodes + 2] = { + {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8}, + {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8}, + {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8}, + {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8}, + {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8}, + {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8}, + {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8}, + {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8}, + {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8}, + {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8}, + {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8}, + {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8}, + {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8}, + {209, 8}, {49, 8}, {177, 
8}, {113, 8}, {241, 8}, {9, 8}, {137, 8}, + {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8}, + {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8}, + {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8}, + {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8}, + {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8}, + {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8}, + {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9}, + {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9}, + {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9}, + {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9}, + {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9}, + {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9}, + {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9}, + {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9}, + {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9}, + {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9}, + {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9}, + {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9}, + {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9}, + {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9}, + {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9}, + {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9}, + {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7}, + {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7}, + {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7}, + {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7}, + {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8}, + {227, 8}, + }; + switch (len) { + case -3: + bits = 
static_ltree[kEndBlock].code; + break; + case -2: + bits = ch; + break; + case -1: + bits = 0; + break; + case 0: + bits = GetHuffLiteralBits(ch); + break; + default: + bits = GetHuffRunBits(len, dist); + break; + } + return bits; +} + +// assembles up to kVecX2 unsigned char values based on given huffman encoding +// writes up to kMaxHuffcodeBits * kVecX2 bits to memory +bool HufEnc(char *len, short *dist, unsigned char *data, unsigned int *outdata, + unsigned int *leftover, unsigned short *leftover_size) { + // array that contains the bit position of each symbol + unsigned short bitpos[kVec + 1]; + bitpos[0] = 0; + + Unroller<0, kVec>::step([&](int i) { + bitpos[i + 1] = bitpos[i] + (IsValid(len[i], dist[i], data[i]) + ? GetHuffLen(len[i], dist[i], data[i]) + : 0); + }); + + // leftover is an array that carries huffman encoded data not yet written to + // memory adjust leftover_size with the number of bits to write this time + unsigned short prev_cycle_offset = *leftover_size; + *leftover_size += (bitpos[kVec] & 0x3fff); + + // we'll write this cycle if we have collected enough data (kVec shorts or + // more) + bool write = *leftover_size & (kVec * (kMaxHuffcodeBits * 2)); + + // subtract kVec shorts from leftover size (if it's bigger + // than kVec) because we'll write those out this cycle + *leftover_size &= ~(kVec * (kMaxHuffcodeBits * 2)); + + // Adjust bitpos based on leftover offset from previous cycle + Unroller<0, kVec>::step( + [&](int i) { bitpos[i] += (prev_cycle_offset & 0x3fff); }); + + // Huffman codes have any bit alignement, so they can spill + // onto two shorts in the output array + // use ushort2 to keep each part of the code separate + // Iterate over all codes and construct ushort2 containing + // the code properly aligned + struct Uint2Gzip code[kVec]; + Unroller<0, kVec>::step([&](int i) { + code[i].x = 0; + code[i].y = 0; + }); + + Unroller<0, kVec>::step([&](int i) { + // Codes can be more than 16 bits, so use uint32 + unsigned int 
curr_code = GetHuffBits(len[i], dist[i], data[i]); + unsigned char bitpos_in_short = bitpos[i] & 0x01F; + + unsigned long long temp = (unsigned long long)curr_code << bitpos_in_short; + unsigned int temp1 = (unsigned int)temp; + unsigned int temp2 = temp >> 32ULL; + + if (IsValid(len[i], dist[i], data[i])) { + code[i].x = temp1; + code[i].y = temp2; + } else { + code[i].x = temp1; + code[i].y = temp2; + } + }); + + // Iterate over all destination locations and gather the required data + unsigned int new_leftover[kVec]; + Unroller<0, kVec>::step([&](int i) { + new_leftover[i] = 0; + outdata[i] = 0; + + Unroller<0, kVec>::step([&](int j) { + // figure out whether code[j] goes into bucket[i] + bool match_first = ((bitpos[j] >> 5) & (kVec - 1)) == i; + bool match_second = + ((bitpos[j] >> 5) & (kVec - 1)) == ((i - 1) & (kVec - 1)); + + // if code[j] maps onto current bucket then OR its code, else OR with 0 + unsigned int component = + match_first ? code[j].x : (match_second ? code[j].y : 0); + + // overflow from kVec shorts, need to move onto new_leftover + bool use_later = + (bitpos[j] & (kVec * (kMaxHuffcodeBits * 2))) || + (match_second && (((bitpos[j] >> 5) & (kVec - 1)) == kVec - 1)); + + // write to output + outdata[i] |= use_later ? 0 : component; + new_leftover[i] |= use_later ? component : 0; + }); + }); + + // Apply previous leftover on the outdata + // Also, if didn't write, apply prev leftover onto newleftover + Unroller<0, kVec>::step([&](int i) { + outdata[i] |= leftover[i]; + leftover[i] = outdata[i]; + }); + + // Split unroll into two unrolls to avoid compiler crash. This is a temporary + // workaround while awaiting a compiler feature. + if (write) { + Unroller<0, kVec>::step([&](int i) { leftover[i] = new_leftover[i]; }); + } + + return write; +} + +template +class CRC; +template +class LZReduction; +template +class StaticHuffman; +template +void SubmitGzipTasksSingleEngine( + queue &q, + size_t block_size, // size of block to compress. 
+ buffer *pibuf, buffer *pobuf, + buffer *gzip_out_buf, + buffer *result_crc, bool last_block, event &e_crc, event &e_lz, + event &e_huff) { + using acc_dist_channel = intel::pipe; + using acc_dist_channel_last = intel::pipe; + + e_crc = q.submit([&](handler &h) { + auto accessor_isz = block_size; + auto acc_pibuf = pibuf->get_access(h); + auto accresult_crc = result_crc->get_access(h); + h.single_task>([=]() [[intel::kernel_args_restrict]] { + const unsigned int table64[64][16] = { + { + 0x0, + 0xf1da05aa, + 0x38c50d15, + 0xc91f08bf, + 0x718a1a2a, + 0x80501f80, + 0x494f173f, + 0xb8951295, + 0xe3143454, + 0x12ce31fe, + 0xdbd13941, + 0x2a0b3ceb, + 0x929e2e7e, + 0x63442bd4, + 0xaa5b236b, + 0x5b8126c1, + }, + + { + 0x0, + 0x1d596ee9, + 0x3ab2ddd2, + 0x27ebb33b, + 0x7565bba4, + 0x683cd54d, + 0x4fd76676, + 0x528e089f, + 0xeacb7748, + 0xf79219a1, + 0xd079aa9a, + 0xcd20c473, + 0x9faeccec, + 0x82f7a205, + 0xa51c113e, + 0xb8457fd7, + }, + + { + 0x0, + 0xee7e8d1, + 0x1dcfd1a2, + 0x13283973, + 0x3b9fa344, + 0x35784b95, + 0x265072e6, + 0x28b79a37, + 0x773f4688, + 0x79d8ae59, + 0x6af0972a, + 0x64177ffb, + 0x4ca0e5cc, + 0x42470d1d, + 0x516f346e, + 0x5f88dcbf, + }, + + { + 0x0, + 0xee7e8d10, + 0x78c1c61, + 0xe9f29171, + 0xf1838c2, + 0xe166b5d2, + 0x89424a3, + 0xe6eaa9b3, + 0x1e307184, + 0xf04efc94, + 0x19bc6de5, + 0xf7c2e0f5, + 0x11284946, + 0xff56c456, + 0x16a45527, + 0xf8dad837, + }, + + { + 0x0, + 0x3c60e308, + 0x78c1c610, + 0x44a12518, + 0xf1838c20, + 0xcde36f28, + 0x89424a30, + 0xb522a938, + 0x38761e01, + 0x416fd09, + 0x40b7d811, + 0x7cd73b19, + 0xc9f59221, + 0xf5957129, + 0xb1345431, + 0x8d54b739, + }, + + { + 0x0, + 0x70ec3c02, + 0xe1d87804, + 0x91344406, + 0x18c1f649, + 0x682dca4b, + 0xf9198e4d, + 0x89f5b24f, + 0x3183ec92, + 0x416fd090, + 0xd05b9496, + 0xa0b7a894, + 0x29421adb, + 0x59ae26d9, + 0xc89a62df, + 0xb8765edd, + }, + + { + 0x0, + 0x6307d924, + 0xc60fb248, + 0xa5086b6c, + 0x576e62d1, + 0x3469bbf5, + 0x9161d099, + 0xf26609bd, + 0xaedcc5a2, + 0xcddb1c86, + 
0x68d377ea, + 0xbd4aece, + 0xf9b2a773, + 0x9ab57e57, + 0x3fbd153b, + 0x5cbacc1f, + }, + + { + 0x0, + 0x86c88d05, + 0xd6e01c4b, + 0x5028914e, + 0x76b13ed7, + 0xf079b3d2, + 0xa051229c, + 0x2699af99, + 0xed627dae, + 0x6baaf0ab, + 0x3b8261e5, + 0xbd4aece0, + 0x9bd34379, + 0x1d1bce7c, + 0x4d335f32, + 0xcbfbd237, + }, + + { + 0x0, + 0x1b5fd1d, + 0x36bfa3a, + 0x2de0727, + 0x6d7f474, + 0x7620969, + 0x5bc0e4e, + 0x409f353, + 0xdafe8e8, + 0xc1a15f5, + 0xec412d2, + 0xf71efcf, + 0xb781c9c, + 0xacde181, + 0x813e6a6, + 0x9a61bbb, + }, + + { + 0x0, + 0x1b5fd1d0, + 0x36bfa3a0, + 0x2de07270, + 0x6d7f4740, + 0x76209690, + 0x5bc0e4e0, + 0x409f3530, + 0xdafe8e80, + 0xc1a15f50, + 0xec412d20, + 0xf71efcf0, + 0xb781c9c0, + 0xacde1810, + 0x813e6a60, + 0x9a61bbb0, + }, + + { + 0x0, + 0x6e8c1b41, + 0xdd183682, + 0xb3942dc3, + 0x61416b45, + 0xfcd7004, + 0xbc595dc7, + 0xd2d54686, + 0xc282d68a, + 0xac0ecdcb, + 0x1f9ae008, + 0x7116fb49, + 0xa3c3bdcf, + 0xcd4fa68e, + 0x7edb8b4d, + 0x1057900c, + }, + + { + 0x0, + 0x5e74ab55, + 0xbce956aa, + 0xe29dfdff, + 0xa2a3ab15, + 0xfcd70040, + 0x1e4afdbf, + 0x403e56ea, + 0x9e36506b, + 0xc042fb3e, + 0x22df06c1, + 0x7cabad94, + 0x3c95fb7e, + 0x62e1502b, + 0x807cadd4, + 0xde080681, + }, + + { + 0x0, + 0xe71da697, + 0x154a4b6f, + 0xf257edf8, + 0x2a9496de, + 0xcd893049, + 0x3fdeddb1, + 0xd8c37b26, + 0x55292dbc, + 0xb2348b2b, + 0x406366d3, + 0xa77ec044, + 0x7fbdbb62, + 0x98a01df5, + 0x6af7f00d, + 0x8dea569a, + }, + + { + 0x0, + 0xaa525b78, + 0x8fd5b0b1, + 0x2587ebc9, + 0xc4da6723, + 0x6e883c5b, + 0x4b0fd792, + 0xe15d8cea, + 0x52c5c807, + 0xf897937f, + 0xdd1078b6, + 0x774223ce, + 0x961faf24, + 0x3c4df45c, + 0x19ca1f95, + 0xb39844ed, + }, + + { + 0x0, + 0xa58b900e, + 0x9066265d, + 0x35edb653, + 0xfbbd4afb, + 0x5e36daf5, + 0x6bdb6ca6, + 0xce50fca8, + 0x2c0b93b7, + 0x898003b9, + 0xbc6db5ea, + 0x19e625e4, + 0xd7b6d94c, + 0x723d4942, + 0x47d0ff11, + 0xe25b6f1f, + }, + + { + 0x0, + 0x5817276e, + 0xb02e4edc, + 0xe83969b2, + 0xbb2d9bf9, + 0xe33abc97, + 0xb03d525, + 
0x5314f24b, + 0xad2a31b3, + 0xf53d16dd, + 0x1d047f6f, + 0x45135801, + 0x1607aa4a, + 0x4e108d24, + 0xa629e496, + 0xfe3ec3f8, + }, + + { + 0x0, + 0x81256527, + 0xd93bcc0f, + 0x581ea928, + 0x69069e5f, + 0xe823fb78, + 0xb03d5250, + 0x31183777, + 0xd20d3cbe, + 0x53285999, + 0xb36f0b1, + 0x8a139596, + 0xbb0ba2e1, + 0x3a2ec7c6, + 0x62306eee, + 0xe3150bc9, + }, + + { + 0x0, + 0x7f6b7f3d, + 0xfed6fe7a, + 0x81bd8147, + 0x26dcfab5, + 0x59b78588, + 0xd80a04cf, + 0xa7617bf2, + 0x4db9f56a, + 0x32d28a57, + 0xb36f0b10, + 0xcc04742d, + 0x6b650fdf, + 0x140e70e2, + 0x95b3f1a5, + 0xead88e98, + }, + + { + 0x0, + 0x9b73ead4, + 0xed96d3e9, + 0x76e5393d, + 0x5ca193, + 0x9b2f4b47, + 0xedca727a, + 0x76b998ae, + 0xb94326, + 0x9bcaa9f2, + 0xed2f90cf, + 0x765c7a1b, + 0xe5e2b5, + 0x9b960861, + 0xed73315c, + 0x7600db88, + }, + + { + 0x0, + 0x172864c, + 0x2e50c98, + 0x3978ad4, + 0x5ca1930, + 0x4b89f7c, + 0x72f15a8, + 0x65d93e4, + 0xb943260, + 0xae6b42c, + 0x9713ef8, + 0x803b8b4, + 0xe5e2b50, + 0xf2cad1c, + 0xcbb27c8, + 0xdc9a184, + }, + + { + 0x0, + 0x172864c0, + 0x2e50c980, + 0x3978ad40, + 0x5ca19300, + 0x4b89f7c0, + 0x72f15a80, + 0x65d93e40, + 0xb9432600, + 0xae6b42c0, + 0x9713ef80, + 0x803b8b40, + 0xe5e2b500, + 0xf2cad1c0, + 0xcbb27c80, + 0xdc9a1840, + }, + + { + 0x0, + 0xa9f74a41, + 0x889f92c3, + 0x2168d882, + 0xca4e23c7, + 0x63b96986, + 0x42d1b104, + 0xeb26fb45, + 0x4fed41cf, + 0xe61a0b8e, + 0xc772d30c, + 0x6e85994d, + 0x85a36208, + 0x2c542849, + 0xd3cf0cb, + 0xa4cbba8a, + }, + + { + 0x0, + 0x9fda839e, + 0xe4c4017d, + 0x7b1e82e3, + 0x12f904bb, + 0x8d238725, + 0xf63d05c6, + 0x69e78658, + 0x25f20976, + 0xba288ae8, + 0xc136080b, + 0x5eec8b95, + 0x370b0dcd, + 0xa8d18e53, + 0xd3cf0cb0, + 0x4c158f2e, + }, + + { + 0x0, + 0x4be412ec, + 0x97c825d8, + 0xdc2c3734, + 0xf4e14df1, + 0xbf055f1d, + 0x63296829, + 0x28cd7ac5, + 0x32b39da3, + 0x79578f4f, + 0xa57bb87b, + 0xee9faa97, + 0xc652d052, + 0x8db6c2be, + 0x519af58a, + 0x1a7ee766, + }, + + { + 0x0, + 0x65673b46, + 0xcace768c, + 0xafa94dca, + 0x4eedeb59, 
+ 0x2b8ad01f, + 0x84239dd5, + 0xe144a693, + 0x9ddbd6b2, + 0xf8bcedf4, + 0x5715a03e, + 0x32729b78, + 0xd3363deb, + 0xb65106ad, + 0x19f84b67, + 0x7c9f7021, + }, + + { + 0x0, + 0xe0c6ab25, + 0x1afc500b, + 0xfa3afb2e, + 0x35f8a016, + 0xd53e0b33, + 0x2f04f01d, + 0xcfc25b38, + 0x6bf1402c, + 0x8b37eb09, + 0x710d1027, + 0x91cbbb02, + 0x5e09e03a, + 0xbecf4b1f, + 0x44f5b031, + 0xa4331b14, + }, + + { + 0x0, + 0xd7e28058, + 0x74b406f1, + 0xa35686a9, + 0xe9680de2, + 0x3e8a8dba, + 0x9ddc0b13, + 0x4a3e8b4b, + 0x9a11d85, + 0xde439ddd, + 0x7d151b74, + 0xaaf79b2c, + 0xe0c91067, + 0x372b903f, + 0x947d1696, + 0x439f96ce, + }, + + { + 0x0, + 0x13423b0a, + 0x26847614, + 0x35c64d1e, + 0x4d08ec28, + 0x5e4ad722, + 0x6b8c9a3c, + 0x78cea136, + 0x9a11d850, + 0x8953e35a, + 0xbc95ae44, + 0xafd7954e, + 0xd7193478, + 0xc45b0f72, + 0xf19d426c, + 0xe2df7966, + }, + + { + 0x0, + 0xef52b6e1, + 0x5d46b83, + 0xea86dd62, + 0xba8d706, + 0xe4fa61e7, + 0xe7cbc85, + 0xe12e0a64, + 0x1751ae0c, + 0xf80318ed, + 0x1285c58f, + 0xfdd7736e, + 0x1cf9790a, + 0xf3abcfeb, + 0x192d1289, + 0xf67fa468, + }, + + { + 0x0, + 0x2ea35c18, + 0x5d46b830, + 0x73e5e428, + 0xba8d7060, + 0x942e2c78, + 0xe7cbc850, + 0xc9689448, + 0xae6be681, + 0x80c8ba99, + 0xf32d5eb1, + 0xdd8e02a9, + 0x14e696e1, + 0x3a45caf9, + 0x49a02ed1, + 0x670372c9, + }, + + { + 0x0, + 0x87a6cb43, + 0xd43c90c7, + 0x539a5b84, + 0x730827cf, + 0xf4aeec8c, + 0xa734b708, + 0x20927c4b, + 0xe6104f9e, + 0x61b684dd, + 0x322cdf59, + 0xb58a141a, + 0x95186851, + 0x12bea312, + 0x4124f896, + 0xc68233d5, + }, + + { + 0x0, + 0x1751997d, + 0x2ea332fa, + 0x39f2ab87, + 0x5d4665f4, + 0x4a17fc89, + 0x73e5570e, + 0x64b4ce73, + 0xba8ccbe8, + 0xaddd5295, + 0x942ff912, + 0x837e606f, + 0xe7caae1c, + 0xf09b3761, + 0xc9699ce6, + 0xde38059b, + }, + + { + 0x0, + 0xae689191, + 0x87a02563, + 0x29c8b4f2, + 0xd4314c87, + 0x7a59dd16, + 0x539169e4, + 0xfdf9f875, + 0x73139f4f, + 0xdd7b0ede, + 0xf4b3ba2c, + 0x5adb2bbd, + 0xa722d3c8, + 0x94a4259, + 0x2082f6ab, + 0x8eea673a, + }, + + { + 0x0, + 
0xe6273e9e, + 0x173f7b7d, + 0xf11845e3, + 0x2e7ef6fa, + 0xc859c864, + 0x39418d87, + 0xdf66b319, + 0x5cfdedf4, + 0xbadad36a, + 0x4bc29689, + 0xade5a817, + 0x72831b0e, + 0x94a42590, + 0x65bc6073, + 0x839b5eed, + }, + + { + 0x0, + 0xb9fbdbe8, + 0xa886b191, + 0x117d6a79, + 0x8a7c6563, + 0x3387be8b, + 0x22fad4f2, + 0x9b010f1a, + 0xcf89cc87, + 0x7672176f, + 0x670f7d16, + 0xdef4a6fe, + 0x45f5a9e4, + 0xfc0e720c, + 0xed731875, + 0x5488c39d, + }, + + { + 0x0, + 0x44629f4f, + 0x88c53e9e, + 0xcca7a1d1, + 0xcafb7b7d, + 0x8e99e432, + 0x423e45e3, + 0x65cdaac, + 0x4e87f0bb, + 0xae56ff4, + 0xc642ce25, + 0x8220516a, + 0x847c8bc6, + 0xc01e1489, + 0xcb9b558, + 0x48db2a17, + }, + + { + 0x0, + 0x9d0fe176, + 0xe16ec4ad, + 0x7c6125db, + 0x19ac8f1b, + 0x84a36e6d, + 0xf8c24bb6, + 0x65cdaac0, + 0x33591e36, + 0xae56ff40, + 0xd237da9b, + 0x4f383bed, + 0x2af5912d, + 0xb7fa705b, + 0xcb9b5580, + 0x5694b4f6, + }, + + { + 0x0, + 0x66b23c6c, + 0xcd6478d8, + 0xabd644b4, + 0x41b9f7f1, + 0x270bcb9d, + 0x8cdd8f29, + 0xea6fb345, + 0x8373efe2, + 0xe5c1d38e, + 0x4e17973a, + 0x28a5ab56, + 0xc2ca1813, + 0xa478247f, + 0xfae60cb, + 0x691c5ca7, + }, + + { + 0x0, + 0xdd96d985, + 0x605cb54b, + 0xbdca6cce, + 0xc0b96a96, + 0x1d2fb313, + 0xa0e5dfdd, + 0x7d730658, + 0x5a03d36d, + 0x87950ae8, + 0x3a5f6626, + 0xe7c9bfa3, + 0x9abab9fb, + 0x472c607e, + 0xfae60cb0, + 0x2770d535, + }, + + { + 0x0, + 0xb407a6da, + 0xb37e4bf5, + 0x779ed2f, + 0xbd8d91ab, + 0x98a3771, + 0xef3da5e, + 0xbaf47c84, + 0xa06a2517, + 0x146d83cd, + 0x13146ee2, + 0xa713c838, + 0x1de7b4bc, + 0xa9e01266, + 0xae99ff49, + 0x1a9e5993, + }, + + { + 0x0, + 0x9ba54c6f, + 0xec3b9e9f, + 0x779ed2f0, + 0x3063b7f, + 0x98a37710, + 0xef3da5e0, + 0x7498e98f, + 0x60c76fe, + 0x9da93a91, + 0xea37e861, + 0x7192a40e, + 0x50a4d81, + 0x9eaf01ee, + 0xe931d31e, + 0x72949f71, + }, + + { + 0x0, + 0xc18edfc, + 0x1831dbf8, + 0x14293604, + 0x3063b7f0, + 0x3c7b5a0c, + 0x28526c08, + 0x244a81f4, + 0x60c76fe0, + 0x6cdf821c, + 0x78f6b418, + 0x74ee59e4, + 0x50a4d810, + 0x5cbc35ec, + 
0x489503e8, + 0x448dee14, + }, + + { + 0x0, + 0xc18edfc0, + 0x586cb9c1, + 0x99e26601, + 0xb0d97382, + 0x7157ac42, + 0xe8b5ca43, + 0x293b1583, + 0xbac3e145, + 0x7b4d3e85, + 0xe2af5884, + 0x23218744, + 0xa1a92c7, + 0xcb944d07, + 0x52762b06, + 0x93f8f4c6, + }, + + { + 0x0, + 0xaef6c4cb, + 0x869c8fd7, + 0x286a4b1c, + 0xd64819ef, + 0x78bedd24, + 0x50d49638, + 0xfe2252f3, + 0x77e1359f, + 0xd917f154, + 0xf17dba48, + 0x5f8b7e83, + 0xa1a92c70, + 0xf5fe8bb, + 0x2735a3a7, + 0x89c3676c, + }, + + { + 0x0, + 0xefc26b3e, + 0x4f5d03d, + 0xeb37bb03, + 0x9eba07a, + 0xe629cb44, + 0xd1e7047, + 0xe2dc1b79, + 0x13d740f4, + 0xfc152bca, + 0x172290c9, + 0xf8e0fbf7, + 0x1a3ce08e, + 0xf5fe8bb0, + 0x1ec930b3, + 0xf10b5b8d, + }, + + { + 0x0, + 0x27ae81e8, + 0x4f5d03d0, + 0x68f38238, + 0x9eba07a0, + 0xb9148648, + 0xd1e70470, + 0xf6498598, + 0xe6050901, + 0xc1ab88e9, + 0xa9580ad1, + 0x8ef68b39, + 0x78bf0ea1, + 0x5f118f49, + 0x37e20d71, + 0x104c8c99, + }, + + { + 0x0, + 0x177b1443, + 0x2ef62886, + 0x398d3cc5, + 0x5dec510c, + 0x4a97454f, + 0x731a798a, + 0x64616dc9, + 0xbbd8a218, + 0xaca3b65b, + 0x952e8a9e, + 0x82559edd, + 0xe634f314, + 0xf14fe757, + 0xc8c2db92, + 0xdfb9cfd1, + }, + + { + 0x0, + 0xacc04271, + 0x82f182a3, + 0x2e31c0d2, + 0xde920307, + 0x72524176, + 0x5c6381a4, + 0xf0a3c3d5, + 0x6655004f, + 0xca95423e, + 0xe4a482ec, + 0x4864c09d, + 0xb8c70348, + 0x14074139, + 0x3a3681eb, + 0x96f6c39a, + }, + + { + 0x0, + 0xccaa009e, + 0x4225077d, + 0x8e8f07e3, + 0x844a0efa, + 0x48e00e64, + 0xc66f0987, + 0xac50919, + 0xd3e51bb5, + 0x1f4f1b2b, + 0x91c01cc8, + 0x5d6a1c56, + 0x57af154f, + 0x9b0515d1, + 0x158a1232, + 0xd92012ac, + }, + + { + 0x0, + 0x7cbb312b, + 0xf9766256, + 0x85cd537d, + 0x299dc2ed, + 0x5526f3c6, + 0xd0eba0bb, + 0xac509190, + 0x533b85da, + 0x2f80b4f1, + 0xaa4de78c, + 0xd6f6d6a7, + 0x7aa64737, + 0x61d761c, + 0x83d02561, + 0xff6b144a, + }, + + { + 0x0, + 0xa6770bb4, + 0x979f1129, + 0x31e81a9d, + 0xf44f2413, + 0x52382fa7, + 0x63d0353a, + 0xc5a73e8e, + 0x33ef4e67, + 0x959845d3, + 
0xa4705f4e, + 0x20754fa, + 0xc7a06a74, + 0x61d761c0, + 0x503f7b5d, + 0xf64870e9, + }, + + { + 0x0, + 0x67de9cce, + 0xcfbd399c, + 0xa863a552, + 0x440b7579, + 0x23d5e9b7, + 0x8bb64ce5, + 0xec68d02b, + 0x8816eaf2, + 0xefc8763c, + 0x47abd36e, + 0x20754fa0, + 0xcc1d9f8b, + 0xabc30345, + 0x3a0a617, + 0x647e3ad9, + }, + + { + 0x0, + 0xcb5cd3a5, + 0x4dc8a10b, + 0x869472ae, + 0x9b914216, + 0x50cd91b3, + 0xd659e31d, + 0x1d0530b8, + 0xec53826d, + 0x270f51c8, + 0xa19b2366, + 0x6ac7f0c3, + 0x77c2c07b, + 0xbc9e13de, + 0x3a0a6170, + 0xf156b2d5, + }, + + { + 0x0, + 0x3d6029b, + 0x7ac0536, + 0x47a07ad, + 0xf580a6c, + 0xc8e08f7, + 0x8f40f5a, + 0xb220dc1, + 0x1eb014d8, + 0x1d661643, + 0x191c11ee, + 0x1aca1375, + 0x11e81eb4, + 0x123e1c2f, + 0x16441b82, + 0x15921919, + }, + + { + 0x0, + 0x3d6029b0, + 0x7ac05360, + 0x47a07ad0, + 0xf580a6c0, + 0xc8e08f70, + 0x8f40f5a0, + 0xb220dc10, + 0x30704bc1, + 0xd106271, + 0x4ab018a1, + 0x77d03111, + 0xc5f0ed01, + 0xf890c4b1, + 0xbf30be61, + 0x825097d1, + }, + + { + 0x0, + 0x60e09782, + 0xc1c12f04, + 0xa121b886, + 0x58f35849, + 0x3813cfcb, + 0x9932774d, + 0xf9d2e0cf, + 0xb1e6b092, + 0xd1062710, + 0x70279f96, + 0x10c70814, + 0xe915e8db, + 0x89f57f59, + 0x28d4c7df, + 0x4834505d, + }, + + { + 0x0, + 0xb8bc6765, + 0xaa09c88b, + 0x12b5afee, + 0x8f629757, + 0x37def032, + 0x256b5fdc, + 0x9dd738b9, + 0xc5b428ef, + 0x7d084f8a, + 0x6fbde064, + 0xd7018701, + 0x4ad6bfb8, + 0xf26ad8dd, + 0xe0df7733, + 0x58631056, + }, + + { + 0x0, + 0x5019579f, + 0xa032af3e, + 0xf02bf8a1, + 0x9b14583d, + 0xcb0d0fa2, + 0x3b26f703, + 0x6b3fa09c, + 0xed59b63b, + 0xbd40e1a4, + 0x4d6b1905, + 0x1d724e9a, + 0x764dee06, + 0x2654b999, + 0xd67f4138, + 0x866616a7, + }, + + { + 0x0, + 0x1c26a37, + 0x384d46e, + 0x246be59, + 0x709a8dc, + 0x6cbc2eb, + 0x48d7cb2, + 0x54f1685, + 0xe1351b8, + 0xfd13b8f, + 0xd9785d6, + 0xc55efe1, + 0x91af964, + 0x8d89353, + 0xa9e2d0a, + 0xb5c473d, + }, + + { + 0x0, + 0x1c26a370, + 0x384d46e0, + 0x246be590, + 0x709a8dc0, + 0x6cbc2eb0, + 0x48d7cb20, + 0x54f16850, + 
0xe1351b80, + 0xfd13b8f0, + 0xd9785d60, + 0xc55efe10, + 0x91af9640, + 0x8d893530, + 0xa9e2d0a0, + 0xb5c473d0, + }, + + { + 0x0, + 0x191b3141, + 0x32366282, + 0x2b2d53c3, + 0x646cc504, + 0x7d77f445, + 0x565aa786, + 0x4f4196c7, + 0xc8d98a08, + 0xd1c2bb49, + 0xfaefe88a, + 0xe3f4d9cb, + 0xacb54f0c, + 0xb5ae7e4d, + 0x9e832d8e, + 0x87981ccf, + }, + + { + 0x0, + 0x4ac21251, + 0x958424a2, + 0xdf4636f3, + 0xf0794f05, + 0xbabb5d54, + 0x65fd6ba7, + 0x2f3f79f6, + 0x3b83984b, + 0x71418a1a, + 0xae07bce9, + 0xe4c5aeb8, + 0xcbfad74e, + 0x8138c51f, + 0x5e7ef3ec, + 0x14bce1bd, + }, + + { + 0x0, + 0x77073096, + 0xee0e612c, + 0x990951ba, + 0x76dc419, + 0x706af48f, + 0xe963a535, + 0x9e6495a3, + 0xedb8832, + 0x79dcb8a4, + 0xe0d5e91e, + 0x97d2d988, + 0x9b64c2b, + 0x7eb17cbd, + 0xe7b82d07, + 0x90bf1d91, + }, + + { + 0x0, + 0x1db71064, + 0x3b6e20c8, + 0x26d930ac, + 0x76dc4190, + 0x6b6b51f4, + 0x4db26158, + 0x5005713c, + 0xedb88320, + 0xf00f9344, + 0xd6d6a3e8, + 0xcb61b38c, + 0x9b64c2b0, + 0x86d3d2d4, + 0xa00ae278, + 0xbdbdf21c, + }, + }; + + const int num_nibbles_parallel = 64; + + const int num_sections = accessor_isz / (num_nibbles_parallel / + 2); // how many loop iterations + unsigned int result = ~0; + + for (int i = 0; i < num_sections; i++) { + unsigned int result_update_odd = 0; + unsigned int result_update_even = 0; +// which 4 bit chunk within the section -- this loop can be unrolled, the +// total update for the crc is the xor of the updates from the nibbles + #pragma unroll + for (int nib = 0; nib < num_nibbles_parallel; nib++) { + unsigned char this_input_nibble = + (acc_pibuf[(i * num_nibbles_parallel + nib) / 2] >> + (4 * (nib % 2))); + unsigned char this_result_nibble = + (nib < 8) ? 
(result >> (4 * nib)) : 0; + unsigned char this_table_index = + this_input_nibble ^ this_result_nibble; + if (nib % 2) { + result_update_odd ^= table64[nib][this_table_index & 0xf]; + } else { + result_update_even ^= table64[nib][this_table_index & 0xf]; + } + } + result = result_update_odd ^ result_update_even; + } + + accresult_crc[0] = ~result; + }); + }); + + e_lz = q.submit([&](handler &h) { + auto accessor_isz = block_size; + auto acc_pibuf = pibuf->get_access(h); + + h.single_task>([=]() [[intel::kernel_args_restrict]] { + //------------------------------------- + // Hash Table(s) + //------------------------------------- + + [[intelfpga::singlepump]] [[intelfpga::numbanks(kVec)]] [ + [intelfpga::max_replicates(kVec)]] struct { + unsigned char s[kLen]; + } dictionary[kDepth][kVec]; + + [[intelfpga::singlepump]] [[intelfpga::numbanks(kVec)]] [ + [intelfpga::max_replicates( + kVec)]] unsigned int dict_offset[kDepth][kVec]; + + // Initialize history to empty. + for (int i = 0; i < kDepth; i++) { + Unroller<0, kVec>::step([&](int k) { dict_offset[i][k] = 0; }); + } + + // This is the window of data on which we look for matches + // We fetch twice our data size because we have kVec offsets + unsigned char current_window[kVecX2]; + + // This is the window of data on which we look for matches + // We fetch twice our data size because we have kVec offsets + unsigned char compare_window[kLen][kVec][kVec]; + // kVec bytes per dict----------| | | + // kVec dictionaries-----------------| | + // one for each curr win offset---------| + + // load offset into these arrays + unsigned int compare_offset[kVec][kVec]; + // one per kVec bytes----------| | + // one for each compwin-------------| + + // Initialize input stream position + unsigned int inpos_minus_vec_div_16 = 0; + + // this is ceiling of (insize-kVec)/16, original comparison was + // inpos < insize, now inpos is carried as (inpos-kVec)/16 so this is what + // we compare to + unsigned int insize_compare = 
(accessor_isz) / kVec; + + int ctr = insize_compare = insize_compare - 1; + + char first_valid_pos = 0; + + struct DistLen dist_offs_data; + + int distchan_ndx = 0; + size_t inpos = 0; + + // load in new data + struct LzInput in; + Unroller<0, kVec>::step([&](int i) { in.data[i] = acc_pibuf[inpos++]; }); + + Unroller<0, kVec>::step( + [&](int i) { current_window[i + kVec] = in.data[i]; }); + + do { + //----------------------------- + // Prepare current window + //----------------------------- + + // shift current window + Unroller<0, kVec>::step( + [&](int i) { current_window[i] = current_window[i + kVec]; }); + + // load in new data + Unroller<0, kVec>::step( + [&](int i) { in.data[i] = acc_pibuf[inpos++]; }); + + Unroller<0, kVec>::step( + [&](int i) { current_window[kVec + i] = in.data[i]; }); + + //----------------------------- + // Compute hash + //----------------------------- + + unsigned short hash[kVec]; + + Unroller<0, kVec>::step([&](int i) { + hash[i] = (current_window[i] ^ (current_window[i + 1] << 6) ^ + (current_window[i + 2] << 2) ^ current_window[i + 3]) & + kHashMask; + }); + + //----------------------------- + // Dictionary look-up + //----------------------------- + + // loop over kVec compare windows, each has a different hash + Unroller<0, kVec>::step([&](int i) { + // loop over all kVec bytes + Unroller<0, kLen>::step([&](int j) { + Unroller<0, kVec>::step([&](int k) { + compare_window[k][j][i] = dictionary[hash[i]][j].s[k]; + }); + }); + }); + + // loop over compare windows + Unroller<0, kVec>::step([&](int i) { + Unroller<0, kLen>::step([&](int j) { + // loop over frames in this compare window + // (they come from different dictionaries) + compare_offset[j][i] = dict_offset[hash[i]][j]; + }); + }); + + //----------------------------- + // Dictionary update + //----------------------------- + + // loop over different dictionaries to store different frames + // store one frame per dictionary + // loop over kVec bytes to store + Unroller<0, 
kLen>::step([&](int i) { + Unroller<0, kVec>::step([&](int j) { + // store actual bytes + dictionary[hash[i]][i].s[j] = current_window[i + j]; + }); + }); + + Unroller<0, kVec>::step([&](int i) { + // loop over kVec different dictionaries and write one word to each + dict_offset[hash[i]][i] = + (inpos_minus_vec_div_16 << 4) | + i; // inpos - kVec + 0, we know that inpos - kVec has 0 as the 4 + // lower bits so really just concatenate + }); + + //----------------------------- + // Match search + //----------------------------- + + // arrays to store length, best length etc.. + unsigned char length[kVec]; + bool done[kVec]; + char best_length[kVec]; + unsigned int best_offset[kVec]; + + // initialize best_length + Unroller<0, kVec>::step([&](int i) { + best_length[i] = 0; + best_offset[i] = 0; + }); + + // loop over each comparison window frame + // one comes from each dictionary + Unroller<0, kVec>::step([&](int i) { + // initialize length and done + Unroller<0, kVec>::step([&](int l) { + length[l] = 0; + done[l] = 0; + }); + + // loop over each current window + Unroller<0, kVec>::step([&](int j) { + // loop over each char in the current window + // and corresponding char in comparison window + Unroller<0, kLen>::step([&](int k) { + bool comp = + current_window[k + j] == compare_window[k][i][j] && !done[j]; + length[j] += comp; + done[j] = !comp; + }); + }); + + // Check if this the best length + Unroller<0, kVec>::step([&](int m) { + bool update_best = + (length[m] > best_length[m]) && (compare_offset[i][m] != 0) && + (((inpos_minus_vec_div_16 << kVecPow) | (i & (kVec - 1))) - + (compare_offset[i][m]) < + kMaxDistance); + + unsigned int new_offset = + (((inpos_minus_vec_div_16 << kVecPow) | (m & (kVec - 1))) & + 0x7ffff) - + ((compare_offset[i][m] & 0x7ffff)); + + // Reconsider if new_offset is bigger than current offset, might + // take more bytes to encode + update_best = update_best && (length[m] == best_length[m]) && + (new_offset > best_offset[m]) + ? 
false + : update_best; + + best_offset[m] = (update_best ? new_offset : best_offset[m]) & + 0x7ffff; // 19 bits is sufficient + + best_length[m] = (update_best ? length[m] : best_length[m]) & + 0x1f; // 5 bits is sufficient + }); + }); + + //----------------------------- + // Filter matches step 1 + //----------------------------- + + // remove matches with offsets that are <= 0: this means they're + // self-matching or didn't match and keep only the matches that, when + // encoded, take fewer bytes than the actual match length + Unroller<0, kVec>::step([&](int i) { + best_length[i] = (((best_length[i] & 0x1f) >= 3) && + ((best_offset[i]) < kMaxDistance) + ? best_length[i] + : 0) & + 0x1f; // 5 bits is sufficient + + // Second level filter - remove matches with len 3, greater than + // kTooFar + best_length[i] = + (((best_length[i] & 0x1f) == 3) && ((best_offset[i]) > kTooFar) + ? 0 + : best_length[i]) & + 0x1f; // 5 bits is sufficient + // don't emmit matches for last iteration as some of the + // second part of the window might be undefined + if (ctr == 0) best_length[i] = 0; + }); + + //----------------------------- + // Assign first_valid_pos + //----------------------------- + + // first_valid_pos is loop-carried, and tricky to compute. So first + // compute it speculatively in parallel for every possible value of the + // previous first_valid_pos. + char first_valid_pos_speculative[kVec]; + + Unroller<0, kVec>::step([&](int guess) { + unsigned char next_match_search = guess; + Unroller<0, kVec>::step([&](int i) { + unsigned int len = best_length[i]; + + // Skip to the next match + next_match_search = + i >= next_match_search && len > 0 ? i + len : next_match_search; + }); + + first_valid_pos_speculative[guess] = + next_match_search - kVec > 0 ? next_match_search - kVec : 0; + }); + + // For kVec=16 (the largest currently supported), this should be a 16:1 + // mux, which is 2 6LUTs deep. For larger kVec, it will be worse. 
+ unsigned char current_valid_pos = first_valid_pos; + first_valid_pos = + first_valid_pos_speculative[first_valid_pos & (kVec - 1)] & + (kVec - + 1); // first_valid_pos only needs 4 bits, make this explicit + + // greedy match selection + Unroller<0, (kVec)>::step([&](int i) { + unsigned int len = best_length[i]; + best_length[i] = i < current_valid_pos ? -1 : best_length[i]; + // Skip to the next match + current_valid_pos = + i >= current_valid_pos && len > 0 ? i + len : current_valid_pos; + }); + + //----------------------------- + // Setup LZ dist/len pairs to push to Huffman encode kernel + //----------------------------- + + Unroller<0, kVec>::step([&](int i) { + dist_offs_data.data[i] = 0; + dist_offs_data.len[i] = -1; + dist_offs_data.dist[i] = -1; + if (best_length[i] >= 0) { + dist_offs_data.data[i] = current_window[i]; + dist_offs_data.len[i] = best_length[i]; + dist_offs_data.dist[i] = best_offset[i]; + } + }); + + acc_dist_channel::write(dist_offs_data); + + // increment input position + inpos_minus_vec_div_16++; + distchan_ndx += 1; + ctr--; + + } while (ctr >= 0); + + const char lasti = accessor_isz - (accessor_isz & ~(kVec - 1)); + const char firstpos = first_valid_pos; + Unroller<0, kVec>::step([&](unsigned char i) { + dist_offs_data.data[i] = 0; + dist_offs_data.len[i] = -1; + dist_offs_data.dist[i] = -1; + }); + + Unroller<0, kVec>::step([&](unsigned char i) { + bool pred = + ((i - firstpos) < (lasti - firstpos)) && ((i - firstpos) >= 0); + dist_offs_data.data[i] = pred ? current_window[i + kVec] : 0; + dist_offs_data.len[i] = pred ? 0 : -1; + }); + + acc_dist_channel_last::write(dist_offs_data); + }); + }); + + e_huff = q.submit([&](handler &h) { + auto accessor_isz = block_size; + auto acc_gzip_out = + gzip_out_buf->get_access(h); + auto accessor_output = pobuf->get_access(h); + auto acc_eof = last_block ? 
1 : 0; + h.single_task>([= + ]() [[intel::kernel_args_restrict]] { + unsigned int leftover[kVec] = {0}; + Unroller<0, kVec>::step([&](int i) { leftover[i] = 0; }); + + unsigned short leftover_size = 0; + + unsigned int outpos_huffman = 0; + + int ctr = ((accessor_isz) / kVec) + 2; + int odx = 0; + + // Add the gzip start block marker. Assumes static huffman trees. + leftover_size = 3; + leftover[0] = ((kStaticTrees << 1) + (acc_eof)); + + do { + struct DistLen in; + // init the input structure for the gzip end block marker. + // this is the very last data block to be encoded and written. + Unroller<0, kVec>::step([&](int i) { + in.len[i] = -1; + in.dist[i] = -1; + in.data[i] = 0; + }); + in.len[0] = ctr == 1 ? -3 : -1; + in.data[0] = 0; + + in = ctr > 2 ? acc_dist_channel::read() + : (ctr == 2 ? acc_dist_channel_last::read() : in); + + struct HuffmanOutput outdata; + outdata.write = HufEnc(in.len, in.dist, in.data, outdata.data, leftover, + &leftover_size); + + // prevent out of bounds write + if (((ctr == 0) || outdata.write) && (odx < accessor_isz)) { + Unroller<0, kVec * sizeof(unsigned int)>::step([&](int i) { + accessor_output[odx + i] = + (ctr == 0) ? (unsigned char)(leftover[(i >> 2) & 0xf] >> + ((i & 3) << 3)) + : (unsigned char)(outdata.data[(i >> 2) & 0xf] >> + ((i & 3) << 3)); + }); + } + + outpos_huffman = outdata.write ? outpos_huffman + 1 : outpos_huffman; + odx += outdata.write ? (sizeof(unsigned int) << kVecPow) : 0; + + } while (ctr--); + + // Store summary values from lz and huffman + acc_gzip_out[0].compression_sz = + (outpos_huffman * sizeof(unsigned int) * kVec) + + (leftover_size + 7) / 8; + }); + }); +} + +void SubmitGzipTasks(queue &q, + size_t block_size, // size of block to compress. + buffer *pibuf, buffer *pobuf, + buffer *gzip_out_buf, + buffer *result_crc, bool last_block, + event &e_crc, event &e_lz, event &e_huff, + size_t engineID) { + // Statically declare the engines so that the hardware is created for them. 
+ // But at run time, the host can dynamically select which engine(s) to use via + // engineID. + if (engineID == 0) { + SubmitGzipTasksSingleEngine<0>(q, block_size, pibuf, pobuf, gzip_out_buf, + result_crc, last_block, e_crc, e_lz, e_huff); + } + + #if NUM_ENGINES > 1 + if (engineID == 1) { + SubmitGzipTasksSingleEngine<1>(q, block_size, pibuf, pobuf, gzip_out_buf, + result_crc, last_block, e_crc, e_lz, e_huff); + } + #endif + + // If this reference design is to be expanded to > 2 engines, declare them here. + +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp new file mode 100755 index 0000000000..7de9a3ea17 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp @@ -0,0 +1,45 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#ifndef __GZIPKERNEL_H__ +#define __GZIPKERNEL_H__ +#pragma once + +#include + +using namespace cl::sycl; + +extern "C" void SubmitGzipTasks( + queue &sycl_device, + size_t block_size, // size of block to compress. + buffer *pibuf, buffer *pobuf, + buffer *gzip_out_buf, + buffer *current_crc, bool last_block, event &e_crc, + event &e_lz, event &e_huff, size_t engineID); + +#endif //__GZIPKERNEL_H__ diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp new file mode 100755 index 0000000000..65f207bab7 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp @@ -0,0 +1,148 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#ifndef __KERNELS_H__ +#define __KERNELS_H__ +#pragma once + +#ifndef NUM_ENGINES + #define NUM_ENGINES 1 +#endif + +constexpr int kNumEngines = NUM_ENGINES; + +// kVecPow == 2 means kVec == 4. +// kVecPow == 3 means kVec == 8. +// kVecPow == 4 means kVec == 16. +constexpr int kVecPow = 4; + +constexpr int kVec = 1 << kVecPow; +constexpr int kVecX2 = 2 * kVec; + +constexpr int kHufTableSize = 256; + +// Maximum length of huffman codes +constexpr int kMaxHuffcodeBits = 16; + +struct Uint2Gzip { + unsigned int y; + unsigned int x; +}; + +struct LzInput { + unsigned char data[kVec]; +}; + +typedef struct DistLen { + unsigned char data[kVec]; + char len[kVec]; + short dist[kVec]; +} DistLen, *pdist_len_t; + +struct HuffmanOutput { + unsigned int data[kVec]; + bool write; +}; + +struct TrailingOutput { + int bytecount_left; + int bytecount; + unsigned char bytes[kVec * sizeof(unsigned int)]; +}; + +struct GzipOutInfo { + // final compressed block size + size_t compression_sz; + unsigned long crc; +}; + +// kLen must be == kVec +constexpr int kLen = kVec; + +// depth of the dictionary buffers +constexpr int kDepth = 512; + +// Assumes kDepth is a power of 2 number. 
+constexpr int kHashMask = kDepth - 1; + +#define CONSTANT __constant + +constexpr int kDebug = 1; +#define TRACE(x) \ + do { \ + if (kDebug) printf x; \ + } while (0) + +constexpr int kStaticTrees = 1; + +typedef struct CtData { + unsigned short code; + unsigned short len; +} CtData; + +constexpr int kMaxMatch = 258; +constexpr int kMinMatch = 3; + +constexpr int kTooFar = 4096; + +// All codes must not exceed kMaxBits +constexpr int kMaxBits = 15; + +// number of length codes, not counting the special kEndBlock code +constexpr int kLengthCodes = 29; + +// number of literal bytes, 0..255 +constexpr int kLiterals = 256; + +// end of literal code block +constexpr int kEndBlock = 256; + +// number of literal or length codes, including kEndBlock +constexpr int kLCodes = (kLiterals + 1 + kLengthCodes); + +// number of distance codes +constexpr int kDCodes = 30; + +// number of codes used to transfer the bit lengths +constexpr int kBLCodes = 19; + +constexpr int kMaxDistance = ((32 * 1024)); + +constexpr int kMinBufferSize = 16384; + +struct DictString { + unsigned char s[kLen]; +}; + +// Mapping from a distance to a distance code. dist is the distance - 1 and +// must not have side effects. dist_code[256] and dist_code[257] are never +// used. +#define d_code(dist) \ + ((dist) < 256 ? 
dist_code[dist] : dist_code[256 + ((dist) >> 7)]) + +#endif //__KERNELS_H__ diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt new file mode 100755 index 0000000000..81cd1c747a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt @@ -0,0 +1,12 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + + +cmake_minimum_required (VERSION 2.8) + +project(QRD) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md new file mode 100755 index 0000000000..34288260cf --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md @@ -0,0 +1,239 @@ +# QR Decomposition of Matrices +This DPC++ reference design demonstrates high-performance QR decomposition of complex matrices on FPGA. + +***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® PAC with Intel Stratix® 10 SX FPGA;
Intel Xeon® CPU E5-1650 v2 @ 3.50GHz (host machine) +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Implementing a high performance FPGA version of the Gram-Schmidt QR decomposition algorithm. +| Time to complete | 1 hr (not including compile time) + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + + +**Performance** +Please refer to performance disclaimer at the end of this README. + +| Device | Throughput +|:--- |:--- +| Intel® PAC with Intel Arria® 10 GX FPGA | 25k matrices/s for matrices of size 128 * 128 +| Intel® PAC with Intel Stratix® 10 SX FPGA | 7k matrices/s for matrices of size 256 * 256 + + +## Purpose + +This FPGA reference design demonstrates QR decomposition of matrices of complex numbers, a common operation employed in linear algebra. Matrix _A_ (input) is decomposed into a product of an orthogonal matrix _Q_ and an upper triangular matrix _R_. + +The algorithms employed by the reference design are the Gram-Schmidt QR decomposition algorithm and the thin QR factorization method. Background information on these algorithms can be found in Wikipedia's [QR decomposition](https://en.wikipedia.org/wiki/QR_decomposition) article. The original algorithm has been modified and optimized for performance on FPGAs in this implementation. + +QR decomposition is used extensively in signal processing applications such as beamforming, multiple-input multiple-output (MIMO) processing, and Space Time Adaptive Processing (STAP). + + +### Matrix dimensions and FPGA resources + +The QR decomposition algorithm factors a complex _m_×_n_ matrix, where _m_ ≥ _n_. The algorithm computes the vector dot product of two columns of the matrix. In our FPGA implementation, the dot product is computed in a loop over the _m_ elements of the column. The loop is fully unrolled to maximize throughput. As a result, *m* complex multiplication operations are performed in parallel on the FPGA, followed by sequential additions to compute the dot product result. 
+ +We use the compiler flag `-fp-relaxed`, which permits the compiler to reorder floating point additions (i.e. to assume that floating point addition is commutative). The compiler uses this freedom to reorder the additions so that the dot product arithmetic can be optimally implemented using the FPGA's specialized floating point DSP (Digital Signal Processing) hardware. + +With this optimization, our FPGA implementation requires 4*m* DSPs to compute the complex floating point dot product. Thus, the matrix size is constrained by the total FPGA DSP resources available. Note that this upper bound is a consequence of this particular implementation. + +By default, the design is parameterized to process 128 × 128 matrices when compiled targeting Intel® PAC with Intel Arria® 10 GX FPGA. It is parameterized to process 256 × 256 matrices when compiled targeting Intel® PAC with Intel Stratix® 10 SX FPGA, a larger device. + + +## Key Implementation Details +| Kernel | Description +--- |--- +| QRD | Implements a modified Gram-Schmidt QR decomposition algorithm. + +To optimize the performance-critical loop in its algorithm, the design leverages concepts discussed in the following FPGA tutorials: +* **Triangular Loop Optimization** (triangular_loop) +* **Explicit Pipelining with `fpga_reg`** (fpga_register) +* **Loop `ivdep` Attribute** (loop_ivdep) +* **Unrolling Loops** (loop_unroll) + + The key optimization techniques used are as follows: + 1. Refactoring the algorithm to merge two dot products into one, reducing the total number of dot products needed from three to two. This helps us reduce the DSPs needed for the implementation. + 2. Converting the nested loop into a single merged loop and applying Triangular Loop optimizations. This allows us to generate a design that is very well pipelined. + 3. Fully vectorizing the dot products using loop unrolling. + 4. 
Using the compiler flag -Xsfp-relaxed to re-order floating point operations and allowing the inference of a specialised dot-product DSP. This further reduces the number of DSP blocks needed by the implementation, the overall latency, and pipeline depth. + 5. Using an efficient memory banking scheme to generate high performance hardware. + 6. Using the `fpga_reg` attribute to insert more pipeline stages where needed to improve the frequency achieved by the design. + +## License +This code sample is licensed under MIT license. + +## Building the Reference Design + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Code Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h. + +### On a Linux* System +1. Install the design into a directory `build` from the design directory by running `cmake`: + + ``` + mkdir build + cd build + ``` + + If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + + ``` + cmake .. + ``` + + If instead you are compiling for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following targets are provided and they match the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device). + + ``` + make fpga_emu + ``` + + * Generate HTML performance report. 
Find the report in `qrd_report.prj/reports/report.html` directory. + + ``` + make report + ``` + + * Compile for FPGA hardware (longer compile time, targets FPGA device). + + ``` + make fpga + ``` + +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +Note: Ensure that Microsoft Visual Studio* (2017, or 2019 Version 16.4 or newer) with "Desktop development with C++" workload is installed on your system. + +1. Enter source file directory. + +``` +cd src +``` + +2. Compile the design. The following targets are provided and they match the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device). + + ``` + ninja fpga_emu + ``` + + * Generate HTML performance report. Find the report in `../src/qrd_report.prj/reports/report.html` directory. + + ``` + ninja report + ``` + + If you are targeting the Intel® PAC with Intel Stratix® 10 SX FPGA, please use the following target and find the report in `../src/qrd_s10_pac_report.prj/reports/report.html`. + + ``` + ninja report_s10_pac + ``` + + * **Not supported yet:** Compile and run on FPGA hardware. + +### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this Reference Design in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Running the Reference Design +You can apply QR decomposition to a number of matrices as shown below. This step performs the following: +* Generates the number of random matrices specified as the command line argument (defaults to 1). +* Computes QR decomposition on all matrices. 
+* Evaluates performance. +NOTE: The design is optimized to perform best when run on a large number of matrices, where the total number of matrices is a power of 2. + + + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU). + ``` + ./qrd.fpga_emu (Linux) + qrd.fpga_emu.exe (Windows) + ``` + +2. Run the sample on the FPGA device. It is recommended to pass in an optional argument (as shown) when invoking the sample on hardware. Otherwise, the performance will not be representative. + ``` + ./qrd.fpga 40960 (Linux) + ``` +### Application Parameters + +| Argument | Description +--- |--- +| `` | Optional argument that specifies the number of matrices to decompose. Its default value is `1`. + +### Example of Output + +Example output when running on Intel® PAC with Intel Arria® 10 GX FPGA for 32768 matrices (each consisting of 128*128 complex numbers): + +``` +Device name: pac_a10 : Intel PAC Platform (pac_f000000) +Generating 32768 random matrices +Running QR decomposition of 32768 matrices repeatedly + Total duration: 41.3763 s +Throughput: 25.3425k matrices/s +Verifying results on matrix 0 16384 32767 +PASSED +``` + +Example output when running on Intel® PAC with Intel Stratix® 10 SX FPGA for 4096 matrices (each consisting of 256*256 complex numbers): + +``` +Device name: pac_s10 : Intel PAC Platform (pac_f100000) +Generating 4096 random matrices +Running QR decomposition of 4096 matrices repeatedly + Total duration: 17.3197 s +Throughput: 7.5678k matrices/s +Verifying results on matrix 0 2048 4095 +PASSED +``` + +## Additional Design Information + +### Compiler Flags Used + +| Flag | Description +--- |--- +`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator) +`-Xsclock=330MHz` | The FPGA backend attempts to achieve 330 MHz +`-Xsfp-relaxed` | Allows the FPGA backend to re-order floating point arithmetic operations (e.g. 
permit assuming (a + b + c) == (c + a + b) ) +`-Xsparallel=2` | Use 2 cores when compiling the bitstream through Quartus +`-Xsseed` | Specifies the Quartus compile seed, to yield slightly higher fmax +`-DROWS_COMPONENT` | Specifies the number of rows of the matrix +`-DCOLS_COMPONENT` | Specifies the number of columns of the matrix +`-DFIXED_ITERATIONS` | Used to set the ivdep safelen attribute for the performance critical triangular loop + +NOTE: The values for `seed`, `FIXED_ITERATIONS`, `ROWS_COMPONENT`, `COLS_COMPONENT` are set according to the board being targeted. + +### Performance disclaimers + +Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks). + +Performance results are based on testing as of July 29, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure. + +Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com). + +The performance was measured by Intel on July 29, 2020. + +Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries. + +(C) Intel Corporation. 
+ + diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln new file mode 100755 index 0000000000..b5e086d1f5 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "qrd", "qrd.vcxproj", "{ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.ActiveCfg = Debug|x64 + {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.Build.0 = Debug|x64 + {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.ActiveCfg = Release|x64 + {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {97D1BD74-AAAB-4835-8F00-37A58B70871A} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj new file mode 100755 index 0000000000..95a7067c03 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj @@ -0,0 +1,170 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + 15.0 + {acde6b7a-6f9a-428e-b040-cedc5b1e2c79} + Win32Proj + qrd + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + 
+ Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions) + + + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + -Xsclock=330MHz;-Xsfp-relaxed;-Xsparallel=2 + + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions) + + + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + -Xsclock=330MHz;-Xsfp-relaxed;-Xsparallel=2 + + + + + + + diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json new file mode 100755 index 0000000000..aa107a266e --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json @@ -0,0 +1,57 @@ +{ + "guid": "3228581F-9DF8-4696-9B1C-0B31286B97C3", + "name": "QR Decomposition of Matrices on FPGA", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"], + "description": "Reference design demonstrating high-performance QR decomposition of complex matrices on FPGA", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "builder": ["ide", "cmake"], + "targetDevice": ["FPGA"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "env": [ + "export CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=32MB" + ], + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./qrd.fpga_emu" + ] + }, + { + "id": 
"report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "env": [ + "set CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=32MB" + ], + "steps": [ + "cd src", + "ninja fpga_emu", + "qrd.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt new file mode 100755 index 0000000000..5003e6a357 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt @@ -0,0 +1,129 @@ +set(DEVICE_SOURCE_FILE qrd.cpp) +set(DEVICE_HEADER_FILE qrd.hpp) +set(HOST_SOURCE_FILE qrd_demo.cpp) +set(TARGET_NAME qrd) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) +set(REPORTS_TARGET ${TARGET_NAME}_report) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Design specific constant values +set(ROWS_COMPONENT_A10 128) +set(COLS_COMPONENT_A10 128) + +set(ROWS_COMPONENT_S10 256) +set(COLS_COMPONENT_S10 256) + +set(FIXED_ITERATIONS_A10 64) +set(FIXED_ITERATIONS_S10 105) + +set(SEED_A10 5) +set(SEED_S10 1) + +# Set parameter values assuming target is Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) +SET(FIXED_ITERATIONS ${FIXED_ITERATIONS_A10}) +SET(SEED ${SEED_A10}) +SET(ROWS_COMPONENT ${ROWS_COMPONENT_A10}) +SET(COLS_COMPONENT ${COLS_COMPONENT_A10}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + SET(FIXED_ITERATIONS ${FIXED_ITERATIONS_S10}) + SET(SEED ${SEED_S10}) + SET(ROWS_COMPONENT ${ROWS_COMPONENT_S10}) + SET(COLS_COMPONENT ${COLS_COMPONENT_S10}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS -fintelfpga -c -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT}) + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +separate_arguments(USER_HARDWARE_FLAGS) +set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsclock=330MHz -Xsfp-relaxed -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT}) +set(FINAL_LINK_FLAGS -fintelfpga -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT}) + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT}") +set(EMULATOR_LINK_FLAGS -fintelfpga ) + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + 
add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set(DEVICE_FPGA_OBJ "qrd_fpga.o") + set(DEVICE_IMAGE_FPGA_OBJ "qrd_fpga.a") + set(HOST_FPGA_OBJ "qrd_host.o") + + add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_FPGA_OBJ} + DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE}) + + add_custom_command(OUTPUT ${HOST_FPGA_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${HOST_FPGA_OBJ} + DEPENDS ${HOST_SOURCE_FILE}) + + add_custom_command(OUTPUT ${DEVICE_IMAGE_FPGA_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link=image ${DEVICE_FPGA_OBJ} -o ${DEVICE_IMAGE_FPGA_OBJ} + DEPENDS ${DEVICE_FPGA_OBJ}) + + add_custom_command(OUTPUT ${FPGA_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${FINAL_LINK_FLAGS} ${HOST_FPGA_OBJ} ${DEVICE_IMAGE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} + DEPENDS 
${DEVICE_IMAGE_FPGA_OBJ} ${HOST_FPGA_OBJ}) +endif() + +# fpga report +if(WIN32) + add_custom_target(report DEPENDS ${REPORTS_TARGET} ) + + separate_arguments(WIN_FLAGS WINDOWS_COMMAND) + add_custom_command(OUTPUT ${REPORTS_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET} + DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE}) + +else() + add_custom_target(report DEPENDS ${REPORTS_TARGET} ) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${DEVICE_SOURCE_FILE} COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_HEADER_FILE} ${DEVICE_HEADER_FILE} COPYONLY) + + add_custom_command(OUTPUT ${REPORTS_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET} + DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) + diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja new file mode 100755 index 0000000000..619923b204 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja @@ -0,0 +1,32 @@ +device_source_file = qrd.cpp +device_header_file = qrd.hpp +host_source_file = qrd_demo.cpp +target_name = qrd + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware -Xsclock=330MHz -Xsfp-relaxed -Xsparallel=2 +emulator_flags = -fintelfpga -DFPGA_EMULATOR -Xsfast-emulator + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} ${device_source_file} ${host_source_file} ${design_flags} -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 
-DCOLS_COMPONENT=128 -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${device_source_file} ${host_source_file} -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 -Xsseed=5 -fsycl-link -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${device_source_file} ${host_source_file} -DFIXED_ITERATIONS=105 -DROWS_COMPONENT=256 -DCOLS_COMPONENT=256 -Xsseed=1 -fsycl-link -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu + +# report +build report: phony ${report_target} +build ${report_target}: gen_report + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp new file mode 100755 index 0000000000..a6d973cbaa --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp @@ -0,0 +1,318 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#include +#include +#include +#include +#include + +#include "qrd.hpp" + +using std::vector; +using namespace sycl; + +template +struct Unroller { + template + static void Step(const Action &action) { + action(begin); + Unroller::Step(action); + } +}; + +template +struct Unroller { + template + static void Step(const Action &action) {} +}; + +struct MyComplex { + float xx; + float yy; + MyComplex(float x, float y) { + xx = x; + yy = y; + } + MyComplex() {} + const MyComplex operator+(const MyComplex other) const { + return MyComplex(xx + other.xx, yy + other.yy); + } +}; + +MyComplex MulMycomplex(MyComplex a, MyComplex b) { + MyComplex c; + c.xx = a.xx * b.xx + a.yy * b.yy; + c.yy = a.yy * b.xx - a.xx * b.yy; + return c; +} + +// Forward declare the kernel name +// (This will become unnecessary in a future compiler version.) 
+class QRD; + +void QRDecomposition(vector &in_matrix, vector &out_matrix, queue &q, + size_t matrices, size_t reps) { + // Number of complex elements in the matrix + constexpr int kNumComplexElements = COLS_COMPONENT * ROWS_COMPONENT; + + // Sizes of allocated memories for input and output matrix + constexpr int kInputMatrixSize = kNumComplexElements * 2; + constexpr int kOutputMatrixSize = + (ROWS_COMPONENT + 1) * COLS_COMPONENT * 3; + + // Constants related to the memory configuration of the kernel's local + // memories + // We want 4 complex elements (2 floating point values) in each memory bank + constexpr int kNumElementsPerBank = 4; + // Set the bankwidth in bytes + constexpr int kBankwidth = kNumElementsPerBank * 8; + constexpr int kNumBanks = ROWS_COMPONENT / kNumElementsPerBank; + + constexpr int kLoadIter = kNumComplexElements / kNumElementsPerBank; + constexpr int kStoreIter = kNumComplexElements / kNumElementsPerBank; + constexpr short kNumBuffers = 4; + + // We will process 'chunk' number of matrices in each run of the kernel + short chunk = 2048; + if (matrices % chunk) { + chunk = 1; + } + + // Create buffers and allocate space for them. 
+ buffer *input_matrix[kNumBuffers], *output_matrix[kNumBuffers]; + for (short i = 0; i < kNumBuffers; i++) { + input_matrix[i] = new buffer(kInputMatrixSize * chunk); + output_matrix[i] = new buffer(kOutputMatrixSize * chunk); + } + + for (size_t r = 0; r < reps; r++) { + for (size_t b = 0, it = 0; it < matrices; + it += chunk, b = (b + 1) % kNumBuffers) { + const float *kPtr = in_matrix.data() + kInputMatrixSize * it; + float *kPtr2 = out_matrix.data() + kOutputMatrixSize * it; + int matrices = chunk; + + q.submit([&](handler &h) { + auto in_matrix2 = + input_matrix[b]->get_access(h); + h.copy(kPtr, in_matrix2); + }); + + q.submit([&](handler &h) { + auto in_matrix = input_matrix[b]->get_access(h); + auto out_matrix = + output_matrix[b]->get_access(h); + auto out_matrix2 = out_matrix; + h.single_task([=]() [[intel::kernel_args_restrict]] { + for (int l = 0; l < matrices; l++) { + [[intelfpga::bankwidth(kBankwidth), + intelfpga::numbanks(kNumBanks)]] struct { + MyComplex d[ROWS_COMPONENT]; + } a_matrix[COLS_COMPONENT], ap_matrix[COLS_COMPONENT], + aload_matrix[COLS_COMPONENT]; + + MyComplex vector_ai[ROWS_COMPONENT], vector_ti[ROWS_COMPONENT]; + MyComplex s_or_i[COLS_COMPONENT]; + + // Copy data from DDR memory to on-chip memory. 
+ int idx = l * kNumComplexElements / kNumElementsPerBank; + for (short li = 0; li < kLoadIter; li++) { + MyComplex tmp[kNumElementsPerBank]; + Unroller<0, kNumElementsPerBank>::Step([&](int k) { + tmp[k].xx = in_matrix[idx * 2 * kNumElementsPerBank + k * 2]; + tmp[k].yy = + in_matrix[idx * 2 * kNumElementsPerBank + k * 2 + 1]; + }); + + idx++; + int jtmp = li % (kNumBanks); + + Unroller<0, kNumBanks>::Step([&](int k) { + Unroller<0, kNumElementsPerBank>::Step([&](int t) { + if (jtmp == k) { + aload_matrix[li / (kNumBanks)] + .d[k * kNumElementsPerBank + t].xx = tmp[t].xx; + aload_matrix[li / (kNumBanks)] + .d[k * kNumElementsPerBank + t].yy = tmp[t].yy; + } + + // Delay data signals to create a vine-based data distribution + // to lower signal fanout. + tmp[t].xx = intel::fpga_reg(tmp[t].xx); + tmp[t].yy = intel::fpga_reg(tmp[t].yy); + }); + + jtmp = intel::fpga_reg(jtmp); + }); + } + + float p_ii_x, i_r_ii_x; + short i = -1; + short j = N_VALUE - FIXED_ITERATIONS < 0 + ? (N_VALUE - FIXED_ITERATIONS) + : 0; + int qr_idx = l * kOutputMatrixSize / 2; + + [[intelfpga::ii(1)]] [[intelfpga::ivdep(FIXED_ITERATIONS)]] + for (int s = 0; s < ITERATIONS; s++) { + MyComplex vector_t[ROWS_COMPONENT]; + MyComplex sori[kNumBanks]; + + bool j_eq_i[kNumBanks], i_gt_0[kNumBanks], + i_ge_0_j_eq_i[kNumBanks], j_eq_i_plus_1[kNumBanks], + i_lt_0[kNumBanks]; + + Unroller<0, kNumBanks>::Step([&](int k) { + i_gt_0[k] = intel::fpga_reg(i > 0); + i_lt_0[k] = intel::fpga_reg(i < 0); + j_eq_i[k] = intel::fpga_reg(j == i); + i_ge_0_j_eq_i[k] = intel::fpga_reg(i >= 0 && j >= i); + j_eq_i_plus_1[k] = intel::fpga_reg(j == i + 1); + sori[k].xx = intel::fpga_reg(s_or_i[j].xx); + sori[k].yy = intel::fpga_reg(s_or_i[j].yy); + }); + + Unroller<0, ROWS_COMPONENT>::Step([&](int k) { + vector_t[k].xx = aload_matrix[j].d[k].xx; + vector_t[k].yy = aload_matrix[j].d[k].yy; + if (i_gt_0[k / kNumElementsPerBank]) { + vector_t[k].xx = a_matrix[j].d[k].xx; + vector_t[k].yy = a_matrix[j].d[k].yy; + } + if 
(j_eq_i[k / kNumElementsPerBank]) { + vector_ai[k].xx = vector_t[k].xx; + vector_ai[k].yy = vector_t[k].yy; + } + }); + + Unroller<0, ROWS_COMPONENT>::Step([&](int k) { + vector_t[k] = + MulMycomplex(vector_ai[k], + i_lt_0[k / kNumElementsPerBank] + ? MyComplex(0.0, 0.0) + : sori[k / kNumElementsPerBank]) + + (j_eq_i[k / kNumElementsPerBank] ? MyComplex(0.0, 0.0) + : vector_t[k]); + if (i_ge_0_j_eq_i[k / kNumElementsPerBank]) { + ap_matrix[j].d[k].xx = a_matrix[j].d[k].xx = + vector_t[k].xx; + ap_matrix[j].d[k].yy = a_matrix[j].d[k].yy = + vector_t[k].yy; + } + if (j_eq_i_plus_1[k / kNumElementsPerBank]) { + vector_ti[k] = vector_t[k]; + } + }); + + MyComplex p_ij = MyComplex(0, 0); + Unroller<0, ROWS_COMPONENT>::Step([&](int k) { + p_ij = p_ij + MulMycomplex(vector_t[k], vector_ti[k]); + }); + + if (j == i + 1) { + p_ii_x = p_ij.xx; + i_r_ii_x = rsqrt(p_ij.xx); + } + + MyComplex s_ij = + MyComplex(0.0f - (p_ij.xx) / p_ii_x, p_ij.yy / p_ii_x); + + if (j >= 0) { + s_or_i[j] = MyComplex(j == i + 1 ? i_r_ii_x : s_ij.xx, + j == i + 1 ? 0.0f : s_ij.yy); + } + + MyComplex r_ii = j == i + 1 ? MyComplex(sycl::sqrt(p_ii_x), 0.0) + : MyComplex(i_r_ii_x * p_ij.xx, + i_r_ii_x * p_ij.yy); + + if (j >= i + 1 && i + 1 < N_VALUE) { + out_matrix[qr_idx * 2] = r_ii.xx; + out_matrix[qr_idx * 2 + 1] = r_ii.yy; + qr_idx++; + } + + if (j == N_VALUE - 1) { + j = ((N_VALUE - FIXED_ITERATIONS) > i) + ? (i + 1) + : (N_VALUE - FIXED_ITERATIONS); + i++; + } else { + j++; + } + } + + qr_idx /= 4; + for (short si = 0; si < kStoreIter; si++) { + int desired = si % (kNumBanks); + bool get[kNumBanks]; + Unroller<0, kNumBanks>::Step([&](int k) { + get[k] = desired == k; + desired = intel::fpga_reg(desired); + }); + + MyComplex tmp[kNumElementsPerBank]; + Unroller<0, kNumBanks>::Step([&](int t) { + Unroller<0, kNumElementsPerBank>::Step([&](int k) { + tmp[k].xx = get[t] ? ap_matrix[si / (kNumBanks)] + .d[t * kNumElementsPerBank + k] + .xx + : intel::fpga_reg(tmp[k].xx); + tmp[k].yy = get[t] ? 
ap_matrix[si / (kNumBanks)] + .d[t * kNumElementsPerBank + k] + .yy + : intel::fpga_reg(tmp[k].yy); + }); + }); + + Unroller<0, 4>::Step([&](int k) { + out_matrix2[qr_idx * 2 * kNumElementsPerBank + k * 2] = + tmp[k].xx; + out_matrix2[qr_idx * 2 * kNumElementsPerBank + k * 2 + 1] = + tmp[k].yy; + }); + + qr_idx++; + } + } + }); + }); + + q.submit([&](handler &h) { + auto final_matrix = output_matrix[b]->get_access(h); + h.copy(final_matrix, kPtr2); + }); + } + } + + for (short b = 0; b < kNumBuffers; b++) { + delete input_matrix[b]; + delete output_matrix[b]; + } +} diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp new file mode 100755 index 0000000000..4ada530ea7 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp @@ -0,0 +1,43 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +// The values for FIXED_ITERATIONS, ROWS_COMPONENT and COLS_COMPONENT will be +// supplied by the build system (cmake/build.ninja) + +// Architecture/Design Parameters used to implement the triagular loop +// structure of the design. See the tutorial on triangular loop optimization +// for more details. +#define N_VALUE COLS_COMPONENT + +#define M_MINUS_COLS \ + (FIXED_ITERATIONS > COLS_COMPONENT ? FIXED_ITERATIONS - COLS_COMPONENT : 0) + +#define ITERATIONS \ + (COLS_COMPONENT + M_MINUS_COLS + (COLS_COMPONENT + 1) * COLS_COMPONENT / 2 + \ + FIXED_ITERATIONS * (FIXED_ITERATIONS - 1) / 2 - \ + M_MINUS_COLS * (M_MINUS_COLS - 1) / 2) diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp new file mode 100755 index 0000000000..4bee78a672 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp @@ -0,0 +1,233 @@ +// ============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, 
subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// This agreement shall be governed in all respects by the laws of the State of +// California and by the laws of the United States of America. + +#include + +#include +#include +#include +#include + +#include "dpc_common.hpp" +#include "qrd.hpp" + +using namespace std; +using namespace std::chrono; +using namespace sycl; + +// Run the modified Gram-Schmidt QR Decomposition algorithm on the given +// matrices. The function will do the following: +// 1. Transfer the input matrices to the FPGA. +// 2. Run the algorithm. +// 3. Copy the output data back to host device. +// The above process is carried out 'reps' number of times. +void QRDecomposition(vector &in_matrix, vector &out_matrix, queue &q, + size_t matrices, size_t reps); + +int main(int argc, char *argv[]) { + constexpr size_t kRandomSeed = 1138; + constexpr size_t kRandomMin = 1; + constexpr size_t kRandomMax = 10; + + size_t matrices = argc > 1 ? 
atoi(argv[1]) : 1; + if (matrices < 1) { + cout << "Must run at least 1 matrix\n"; + return 1; + } + + try { +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + queue q = queue(device_selector, dpc_common::exception_handler); + device device = q.get_device(); + cout << "Device name: " << device.get_info().c_str() + << "\n"; + + vector a_matrix; + vector qr_matrix; + + constexpr size_t kAMatrixSizeFactor = ROWS_COMPONENT * COLS_COMPONENT * 2; + constexpr size_t kQRMatrixSizeFactor = + (ROWS_COMPONENT + 1) * COLS_COMPONENT * 3; + constexpr size_t kIndexAccessFactor = 2; + + a_matrix.resize(matrices * kAMatrixSizeFactor); + qr_matrix.resize(matrices * kQRMatrixSizeFactor); + + // For output-postprocessing + float q_matrix[ROWS_COMPONENT][COLS_COMPONENT][2]; + float r_matrix[COLS_COMPONENT][COLS_COMPONENT][2]; + + cout << "Generating " << matrices << " random matri" + << ((matrices == 1) ? "x " : "ces ") << "\n"; + + srand(kRandomSeed); + + for (size_t i = 0; i < matrices; i++) { + for (size_t row = 0; row < ROWS_COMPONENT; row++) { + for (size_t col = 0; col < COLS_COMPONENT; col++) { + int random_val = rand(); + float random_double = + random_val % (kRandomMax - kRandomMin) + kRandomMin; + a_matrix[i * kAMatrixSizeFactor + + col * ROWS_COMPONENT * kIndexAccessFactor + + row * kIndexAccessFactor] = random_double; + int random_val_imag = rand(); + random_double = + random_val_imag % (kRandomMax - kRandomMin) + kRandomMin; + a_matrix[i * kAMatrixSizeFactor + + col * ROWS_COMPONENT * kIndexAccessFactor + + row * kIndexAccessFactor + 1] = random_double; + } + } + } + + QRDecomposition(a_matrix, qr_matrix, q, 1, 1); // Accelerator warmup + +#if defined(FPGA_EMULATOR) + size_t reps = 2; +#else + size_t reps = 32; +#endif + cout << "Running QR decomposition of " << matrices << " matri" + << ((matrices == 1) ? "x " : "ces ") + << ((reps > 1) ? 
"repeatedly" : "") << "\n"; + + high_resolution_clock::time_point start_time = high_resolution_clock::now(); + QRDecomposition(a_matrix, qr_matrix, q, matrices, reps); + high_resolution_clock::time_point end_time = high_resolution_clock::now(); + duration diff = end_time - start_time; + q.throw_asynchronous(); + + cout << " Total duration: " << diff.count() << " s" + << "\n"; + cout << "Throughput: " << reps * matrices / diff.count() / 1000 + << "k matrices/s" + << "\n"; + + list to_check; + // We will check at least matrix 0 + to_check.push_back(0); + // Spot check the last and the middle one + if (matrices > 2) to_check.push_back(matrices / 2); + if (matrices > 1) to_check.push_back(matrices - 1); + + cout << "Verifying results on matrix"; + + for (size_t matrix : to_check) { + cout << " " << matrix; + size_t idx = 0; + for (size_t i = 0; i < COLS_COMPONENT; i++) { + for (size_t j = 0; j < COLS_COMPONENT; j++) { + if (j < i) + r_matrix[i][j][0] = r_matrix[i][j][1] = 0; + else { + r_matrix[i][j][0] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++]; + r_matrix[i][j][1] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++]; + } + } + } + + for (size_t j = 0; j < COLS_COMPONENT; j++) { + for (size_t i = 0; i < ROWS_COMPONENT; i++) { + q_matrix[i][j][0] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++]; + q_matrix[i][j][1] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++]; + } + } + + float acc_real = 0; + float acc_imag = 0; + float v_matrix[ROWS_COMPONENT][COLS_COMPONENT][2] = {{{0}}}; + for (size_t i = 0; i < ROWS_COMPONENT; i++) { + for (size_t j = 0; j < COLS_COMPONENT; j++) { + acc_real = 0; + acc_imag = 0; + for (size_t k = 0; k < COLS_COMPONENT; k++) { + acc_real += q_matrix[i][k][0] * r_matrix[k][j][0] - + q_matrix[i][k][1] * r_matrix[k][j][1]; + acc_imag += q_matrix[i][k][0] * r_matrix[k][j][1] + + q_matrix[i][k][1] * r_matrix[k][j][0]; + } + v_matrix[i][j][0] = acc_real; + v_matrix[i][j][1] = acc_imag; + } + } + + float error = 0; + size_t count = 0; + 
constexpr float kErrorThreshold = 1e-4; + for (size_t row = 0; row < ROWS_COMPONENT; row++) { + for (size_t col = 0; col < COLS_COMPONENT; col++) { + if (std::isnan(v_matrix[row][col][0]) || + std::isnan(v_matrix[row][col][1])) { + count++; + } + float real = v_matrix[row][col][0] - + a_matrix[matrix * kAMatrixSizeFactor + + col * ROWS_COMPONENT * kIndexAccessFactor + + row * kIndexAccessFactor]; + float imag = v_matrix[row][col][1] - + a_matrix[matrix * kAMatrixSizeFactor + + col * ROWS_COMPONENT * kIndexAccessFactor + + row * kIndexAccessFactor + 1]; + if (sqrt(real * real + imag * imag) >= kErrorThreshold) { + error += sqrt(real * real + imag * imag); + count++; + } + } + } + + if (count > 0) { + cout << "\nFAILED\n"; + cout << "\n" + << "!!!!!!!!!!!!!! Error = " << error << " in " << count << " / " + << ROWS_COMPONENT * COLS_COMPONENT << "\n"; + return 1; + } + } + + cout << "\nPASSED\n"; + return 0; + + } catch (sycl::exception const &e) { + cout << "Caught a synchronous SYCL exception: " << e.what() << "\n"; + cout << " If you are targeting an FPGA hardware, " + "ensure that your system is plugged to an FPGA board that is " + "set up correctly" + << "\n"; + cout << " If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR" + << "\n"; + + terminate(); + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt new file mode 100755 index 0000000000..5c0cea463c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(DoubleBuffering) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git 
a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md new file mode 100755 index 0000000000..31b7e3df37 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md @@ -0,0 +1,223 @@ +# Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing +This FPGA tutorial demonstrates how to parallelize host-side processing and buffer transfers between host and device with kernel execution, which can improve overall application performance. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How and when to implement the double buffering optimization technique +| Time to complete | 30 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +In an application where the FPGA kernel is executed multiple times, the host must perform the following processing and buffer transfers before each kernel invocation. +1. The output data from the *previous* invocation must be transferred from device to host and then processed by the host. Examples of this processing include: + * Copying the data to another location + * Rearranging the data + * Verifying it in some way +2. The input data for the *next* invocation must be processed by the host and then transferred to the device. Examples of this processing include: + * Copying the data from another location + * Rearranging the data for kernel consumption + * Generating the data in some way + +Without double buffering, host processing and buffer transfers occur *between* kernel executions. Therefore, there is a gap in time between kernel executions, which you can refer to as kernel *downtime* (see diagram below). If these operations overlap with kernel execution, the kernels can execute back-to-back with minimal downtime, thereby increasing overall application performance. + +### Determining When is Double Buffering Possible + +Let's define the required variables: +* **R** = Time to transfer the kernel's output buffer from device to host. +* **Op** = Host-side processing time of kernel output data (*output processing*) +* **Ip** = Host-side processing time for kernel input data (*input processing*) +* **W** = Time to transfer the kernel's input buffer from host to device. +* **K** = Kernel execution time + +![](downtime.png) + +In general, **R**, **Op**, **Ip**, and **W** operations must all complete before the next kernel is launched. 
To maximize performance, while one kernel is executing on the device, these operations should execute simultaneously on the host and operate on a second set of buffer locations. They should complete before the current kernel completes, thus allowing the next kernel to be launched immediately with no downtime. In general, to maximize performance, the host must launch a new kernel every **K**. + +This leads to the following constraint: + +```c++ +R + Op + Ip + W <= K, in order to minimize kernel downtime. +``` +If the above constraint is not satisfied, a performance improvement may still be observed because *some* overlap (perhaps not complete overlap) is still possible. Further improvement is possible by extending the double buffering concept to N-way buffering (see the corresponding tutorial). + +### Measuring the Impact of Double Buffering + +You must get a sense of the kernel downtime to identify the degree to which this technique can help improve performance. + +This can be done by querying the total kernel execution time from the runtime and comparing it to the overall application execution time. In an application where kernels execute with minimal downtime, these two numbers will be close. However, if kernels have a lot of downtime, overall execution time will notably exceed kernel execution time. The tutorial code exemplifies how to do this. + +### Tutorial Implementation Notes + +The basic idea is to: +1. Perform the input processing for the first two kernel executions and queue them both. +2. Immediately call the `process_output()` method (automatically blocked by the SYCL* runtime) on the first kernel completing because of the implicit data dependency. +3. When the first kernel completes, the second kernel begins executing immediately because it was already queued. +4. While the second kernel runs, the host processes the output data from the first kernel and prepares the input data for the third kernel. +5. 
As long as the above operations complete before the second kernel completes, the third kernel is queued early enough to allow it to be launched immediately after the second kernel. + +The process then repeats. + +The impact of double buffering on the total runtime of the tutorial program will be analyzed in the "Running the Sample" section below. + +## Key Concepts +* The double buffering optimization technique +* Determining when double buffering is beneficial +* How to measure the impact of double buffering + +## License +This code sample is licensed under MIT license. + + +## Building the `double_buffering` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `double_buffering_report.prj/reports/` or `double_buffering_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Note that because the optimization described in this tutorial takes place at the *runtime* level, the FPGA compiler report will not show a difference between the optimized and unoptimized cases. + + +## Running the Sample + + 1. 
Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./double_buffering.fpga_emu (Linux) + double_buffering.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./double_buffering.fpga (Linux) + ``` + +### Example of Output + +``` +Platform name: Intel(R) FPGA SDK for OpenCL(TM) +Device name: pac_a10 : Intel PAC Platform (pac_ee00000) + + +Executing kernel 100 times in each round. + +*** Beginning execution, without double buffering +Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time without double buffering = 29742 ms +Total kernel-only execution time without double buffering = 17856 ms +Throughput = 35.255249 MB/s + + +*** Beginning execution, with double buffering. +Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time with double buffering = 17967 ms +Total kernel-only execution time with double buffering = 17869 ms +Throughput = 58.35976 MB/s + + +Verification PASSED +``` + +### Discussion of Results + +A test compile of this tutorial design achieved a maximum frequency (fMAX) of approximately 340 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results with and without double buffering are shown in the following table: + +Configuration | Overall Execution Time (ms) | Total Kernel Execution time (ms) +-|-|- +Without double buffering | 23462 | 15187 +With double buffering | 15145 | 15034 + +In both runs, the total kernel execution time is similar, as expected. 
However, without double buffering, the overall execution time notably exceeds the total kernel execution time, implying there is downtime between kernel executions. With double buffering, the overall execution time is close to the total kernel execution time. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln new file mode 100755 index 0000000000..4108b65da8 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "double_buffering", "double_buffering.vcxproj", "{6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Debug|x64.ActiveCfg = Debug|x64 + {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Debug|x64.Build.0 = Debug|x64 + {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Release|x64.ActiveCfg = Release|x64 + {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {1878B8F8-3C90-4CB5-9A71-66501FA4A3BA} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj new file mode 100755 index 0000000000..b7ee382578 --- /dev/null +++ 
b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {6910a54a-bfe5-462f-9f3b-b84f62c5add1} + Win32Proj + double_buffering + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)double_buffering.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)double_buffering.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png new file mode 100755 index 0000000000..2a306929bc Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json new file mode 100755 index 0000000000..b10e6e185a --- /dev/null +++ 
b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "B210B44F-FB86-4F42-BA4A-9980805350FF", + "name": "Overlapping Kernel Execution with Buffer Transfers and Host Processing through Double Buffering", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial design to demonstrate overlapping kernel execution with buffer transfers and host-processing to improve system performance", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./double_buffering.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "double_buffering.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt new file mode 100755 index 0000000000..f918135042 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt @@ -0,0 +1,89 @@ +set(SOURCE_FILE double_buffering.cpp) +set(TARGET_NAME double_buffering) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT 
DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES 
COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja new file mode 100755 index 0000000000..3e8fdc6126 --- /dev/null +++ 
b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja @@ -0,0 +1,30 @@ +source_file = double_buffering.cpp +target_name = double_buffering + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp new file mode 100755 index 0000000000..556507e307 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp @@ -0,0 +1,349 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include + +#include "dpc_common.hpp" + +using namespace sycl; + +// kTimes = # times to execute the kernel. kTimes must be >= 2 +// kSize = # of floats to process on each kernel execution. 
+// run less in emulation to avoid high run time +#if defined(FPGA_EMULATOR) +constexpr int kTimes = 20; +constexpr int kSize = 4096; +#else +constexpr int kTimes = 100; +constexpr int kSize = 2621440; +#endif + +// Kernel executes a power function (base^kPow). Must be +// >= 2. Can increase this to increase kernel execution +// time, but ProcessOutput() time will also increase. +constexpr int kPow = 20; + +// Number of iterations through the main loop +constexpr int kNumRuns = 2; + +bool pass = true; + +class SimpleVpow; + +/* Kernel function. + Performs buffer_b[i] = buffer_a[i] ** pow + Only supports pow >= 2. + This kernel is not meant to be an optimal implementation of the power + operation -- it's just a sample kernel for this tutorial whose execution time + is easily controlled via the pow parameter. SYCL buffers are created + externally and passed in by reference to control (external to this function) + when the buffers are destructed. The destructor causes a blocking buffer + transfer from device to host and double buffering requires us to not block + here (because we need to launch another kernel). So we only want this + transfer to occur at the end of overall execution, not at the end of each + individual kernel execution. 
+*/ +void SimplePow(std::unique_ptr &q, buffer &buffer_a, + buffer &buffer_b, event &e) { + // Submit to the queue and execute the kernel + e = q->submit([&](handler &h) { + // Get kernel access to the buffers + auto accessor_a = buffer_a.get_access(h); + auto accessor_b = buffer_b.get_access(h); + + const int num = kSize; + assert(kPow >= 2); + const int p = kPow - 1; // Assumes pow >= 2; + + h.single_task([=]() [[intel::kernel_args_restrict]] { + for (int j = 0; j < p; j++) { + if (j == 0) { + for (int i = 0; i < num; i++) { + accessor_b[i] = accessor_a[i] * accessor_a[i]; + } + } else { + for (int i = 0; i < num; i++) { + accessor_b[i] = accessor_b[i] * accessor_a[i]; + } + } + } + }); + }); + + event update_host_event; + update_host_event = q->submit([&](handler &h) { + auto accessor_b = buffer_b.get_access(h); + + /* + Explicitly instruct the SYCL runtime to copy the kernel's output buffer + back to the host upon kernel completion. This is not required for + functionality since the buffer access in ProcessOutput() also implicitly + instructs the runtime to copy the data back. But it should be noted that + this buffer access blocks ProcessOutput() until the kernel is complete + and the data is copied. In contrast, update_host() instructs the runtime + to perform the copy earlier. This allows ProcessOutput() to optionally + perform more useful work *before* making the blocking buffer access. Said + another way, this allows ProcessOutput() to potentially perform more work + in parallel with the runtime's copy operation. + */ + h.update_host(accessor_b); + }); +} + +// Returns kernel execution time for a given SYCL event from a queue. +ulong SyclGetExecTimeNs(event e) { + ulong start_time = + e.get_profiling_info(); + ulong end_time = + e.get_profiling_info(); + return (end_time - start_time); +} + +// Local pow function for verifying results +float MyPow(float input, int pow) { + return (pow == 0) ? 
1 : input * MyPow(input, pow - 1); +} + +/* Compares kernel output against expected output. Only compares part of the + output so that this method completes quickly. This is done + intentionally/artificially keep host-processing time shorter than kernel + execution time. Grabs kernel output data from its SYCL buffer. Reading from + this buffer is a blocking operation that will block on the kernel completing. + Queries and records execution time of the kernel that just completed. This + is a natural place to do this because ProcessOutput() is blocked on kernel + completion. +*/ +void ProcessOutput(buffer &input_buf, + buffer &output_buf, int exec_number, event e, + ulong &total_kernel_time_per_slot) { + auto input_buf_acc = input_buf.get_access(); + auto output_buf_acc = output_buf.get_access(); + int num_errors = 0; + int num_errors_to_print = 10; + /* The use of update_host() in the kernel function allows for additional + host-side operations to be performed here, in parallel with the buffer copy + operation from device to host, before the blocking access to the output + buffer is made via output_buf_acc[]. To be clear, no real operations are + done here and this is just a note that this is the place + where you *could* do it. */ + for (int i = 0; i < kSize / 8; i++) { + const bool out_valid = (MyPow(input_buf_acc[i], kPow) != output_buf_acc[i]); + if ((num_errors < num_errors_to_print) && out_valid) { + if (num_errors == 0) { + pass = false; + std::cout << "Verification failed on kernel execution # " << exec_number + << ". Showing up to " << num_errors_to_print + << " mismatches.\n"; + } + std::cout << "Verification failed on kernel execution # " << exec_number + << ", at element " << i << ". Expected " << std::fixed + << std::setprecision(16) << MyPow(input_buf_acc[i], kPow) + << " but got " << output_buf_acc[i] << "\n"; + num_errors++; + } + } + + // At this point we know the kernel has completed, + // so can query the profiling data. 
+ total_kernel_time_per_slot += SyclGetExecTimeNs(e); +} + +/* + Generates input data for the next kernel execution. Only fills part of the + buffer so that this method completes quickly. This is done + intentionally/artificially keep host-processing time shorter than kernel + execution time. Writes the data into the associated SYCL buffer. The write + will block until the previous kernel execution, that is using this buffer, + completes. +*/ +void ProcessInput(buffer &buf) { + // We are generating completely new input data, so can use discard_write() + // here to indicate we don't care about the SYCL buffer's current contents. + auto buf_acc = buf.get_access(); + + // RNG seed + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + + // RNG engine + std::default_random_engine dre(seed); + + // generate random numbers between 1 and 2 + std::uniform_real_distribution di(1.0f, 2.0f); + + // Randomly generate a start value and increment from there. + // Compared to randomly generating every value, this is done to + // speed up this function a bit. + float start_val = di(dre); + + for (int i = 0; i < kSize / 8; i++) { + buf_acc[i] = start_val; + start_val++; + } +} + +int main() { +// Create queue, get platform and device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; + std::cout << "\nEmulator output does not demonstrate true hardware " + "performance. 
The design may need to run on actual hardware " + "to observe the performance benefit of the optimization " + "exemplified in this tutorial.\n\n"; +#else + intel::fpga_selector device_selector; +#endif + + try { + auto prop_list = + property_list{property::queue::enable_profiling()}; + + std::unique_ptr q; + q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list)); + + platform platform = q->get_context().get_platform(); + device device = q->get_device(); + std::cout << "Platform name: " + << platform.get_info().c_str() << "\n"; + std::cout << "Device name: " + << device.get_info().c_str() << "\n\n\n"; + + std::cout << "Executing kernel " << kTimes << " times in each round.\n\n"; + + // Create a vector to store the input/output SYCL buffers + std::vector> input_buf; + std::vector> output_buf; + + // SYCL events for each kernel launch. + event sycl_events[2]; + + // In nanoseconds. Total execution time of kernels in a given slot. + ulong total_kernel_time_per_slot[2]; + + // Total execution time of all kernels. + ulong total_kernel_time = 0; + + // Allocate vectors to store the host-side copies of the input data + // Create and allocate the SYCL buffers + for (int i = 0; i < 2; i++) { + input_buf.push_back(buffer(range<1>(kSize))); + output_buf.push_back(buffer(range<1>(kSize))); + } + + /* + Main loop. This loop runs twice to show the performance difference without + and with double buffering. + */ + for (int i = 0; i < kNumRuns; i++) { + for (int i = 0; i < 2; i++) { + total_kernel_time_per_slot[i] = 0; // Initialize timers to zero. + } + + switch (i) { + case 0: { + std::cout << "*** Beginning execution, without double buffering\n"; + break; + } + case 1: { + std::cout << "*** Beginning execution, with double buffering.\n"; + break; + } + default: { + std::cout << "*** Beginning execution.\n"; + } + } + + // Start the timer. This will include the time to process the input data + // for the first 2 kernel executions. 
+ dpc_common::TimeInterval exec_time; + + if (i == 0) { // Single buffering + for (int i = 0; i < kTimes; i++) { + // Only print every few iterations, just to limit the prints. + if (i % 10 == 0) { + std::cout << "Launching kernel #" << i << "\n"; + } + + ProcessInput(input_buf[0]); + SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]); + ProcessOutput(input_buf[0], output_buf[0], i, sycl_events[0], + total_kernel_time_per_slot[0]); + } + } else { // Double buffering + // Process input for first 2 kernel launches and queue them. Then block + // on processing the output of the first kernel. + ProcessInput(input_buf[0]); + ProcessInput(input_buf[1]); + + std::cout << "Launching kernel #0\n"; + + SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]); + for (int i = 1; i < kTimes; i++) { + if (i % 10 == 0) { + std::cout << "Launching kernel #" << i << "\n"; + } // Only print every few iterations, just to limit the prints. + + // Launch the next kernel + SimplePow(q, input_buf[i % 2], output_buf[i % 2], sycl_events[i % 2]); + + // Process output from previous kernel. This will block on kernel + // completion. + ProcessOutput(input_buf[(i - 1) % 2], output_buf[(i - 1) % 2], i, + sycl_events[(i - 1) % 2], + total_kernel_time_per_slot[(i - 1) % 2]); + + // Generate input for the next kernel. + ProcessInput(input_buf[(i - 1) % 2]); + } + + // Process output of the final kernel + ProcessOutput(input_buf[(kTimes - 1) % 2], output_buf[(kTimes - 1) % 2], + i, sycl_events[(kTimes - 1) % 2], + total_kernel_time_per_slot[(kTimes - 1) % 2]); + } + + // Add up the overall kernel execution time. + total_kernel_time = 0; + for (int i = 0; i < 2; i++) { + total_kernel_time += total_kernel_time_per_slot[i]; + } + + // Stop the timer. + double time_span = exec_time.Elapsed(); + + std::cout << "\nOverall execution time " + << ((i == 0) ? 
"without" : "with") << " double buffering = " + << (unsigned)(time_span * 1000) << " ms\n"; + std::cout << "Total kernel-only execution time " + << ((i == 0) ? "without" : "with") << " double buffering = " + << (unsigned)(total_kernel_time / 1000000) << " ms\n"; + std::cout << "Throughput = " << std::setprecision(8) + << (float)kSize * (float)kTimes * (float)sizeof(float) / + (float)time_span / 1000000 + << " MB/s\n\n\n"; + } + if (pass) { + std::cout << "Verification PASSED\n"; + } else { + std::cout << "Verification FAILED\n"; + return 1; + } + } catch (sycl::exception const& e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt new file mode 100755 index 0000000000..134e6d8534 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt @@ -0,0 +1,12 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + + +cmake_minimum_required (VERSION 2.8) + +project(NWayBuffering) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ 
b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md new file mode 100755 index 0000000000..d4fb12ba40 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md @@ -0,0 +1,297 @@ + +# N-Way Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing + +This FPGA tutorial demonstrates how to parallelize host-side processing and buffer transfers between host and device with kernel execution to improve overall application performance. It is a generalization of the 'double buffering' technique, and can be used to perform this overlap even when the host-processing time exceeds kernel execution time. 
+ +***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How and when to apply the N-way buffering optimization technique +| Time to complete | 30 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +N-Way buffering is a generalization of the double buffering optimization technique (see the "Double Buffering" FPGA tutorial). This system-level optimization enables kernel execution to occur in parallel with host-side processing and buffer transfers between host and device, improving application performance. N-way buffering can achieve this overlap even when the host-processing time exceeds kernel execution time. + +### Background + +In an application where the FPGA kernel is executed multiple-times, the host must perform the following processing and buffer transfers before each kernel invocation: +1. The output data from the *previous* invocation must be transferred from the device to host and then processed by the host. Examples of this processing include the following: + * Copying the data to another location + * Rearranging the data + * Verifying it in some way +2. The input data for the *next* invocation must be processed by the host and then transferred to the device. Examples of this processing include: + * Copying the data from another location + * Rearranging the data for kernel consumption + * Generating the data in some way + +Without the technique described in this tutorial, host processing and buffer transfers occur *between* kernel executions. Therefore, there is a gap in time between kernel executions, which you can refer to as kernel "downtime" (see diagram below). If these operations overlap with kernel execution, the kernels can execute back-to-back with minimal downtime, thereby increasing overall application performance. 
+ +### N-Way Buffering + +This technique is referred to as *N-Way Buffering*, but is frequently called *double buffering* in the most common case where N=2. + +Let's first define some variables: + +| Variable | Description | +| ------ | ------ | +| **R** | Time to transfer the kernel's output buffer from device to host. | +| **Op** | Host-side processing time of kernel output data (*output processing*). | +| **Ip** | Host-side processing time for kernel input data (*input processing*). | +| **W** | Time to transfer the kernel's input buffer from host to device. | +| **K** | Kernel execution time. | +| **N** | Number of buffer sets used. | +| **C** | Number of host-side CPU cores. | + + + +![](downtime.png) + +In general, the **R**, **Op**, **Ip**, and **W** operations must all complete before the next kernel is launched. To maximize performance, while one kernel is executing on the device, these operations should run in parallel and operate on a separate set of buffer locations. They should complete before the current kernel completes, thus allowing the next kernel to be launched immediately with no downtime. In general, to maximize performance, the host must launch a new kernel every **K**. + +If these host-side operations are executed serially, this leads to the following constraint: + +```c++ +R + Op + Ip + W <= K, to minimize kernel downtime. +``` + +In the above example, if the constraint is satisfied, the application requires two sets of buffers. In this case, **N**=2. + +However, the above constraint may not be satisfied in some applications (i.e., if host-processing takes longer than the kernel execution time). + +**NOTE**: A performance improvement may still be observed because kernel downtime may still be reduced (though perhaps not maximally reduced). + +In this case, to further improve performance, reduce the host-processing time through multi-threading. 
Rather than executing the above operations serially, perform the input- and output-processing operations in parallel using two threads, leading to the following constraint: + +```c++ +Max (R+Op, Ip+W) <= K +and +R + W <= K, to minimize kernel downtime. +``` + +If the above constraint is still unsatisfied, the technique can be extended beyond two sets of buffers to **N** sets of buffers to help improve the degree of overlap. In this case, the constraint becomes: + +```c++ +Max (R + Op, Ip + W) <= (N-1)*K +and +R + W <= K, to minimize kernel downtime. +``` + +The idea of N-way buffering is to prepare **N** sets of kernel input buffers, launch **N** kernels, and when the first kernel completes, begin the subsequent host-side operations. These operations may take a long time (longer than **K**), but they do not cause kernel downtime because an additional **N**-1 kernels have already been queued and can launch immediately. By the time these first **N** kernels complete, the aforementioned host-side operations would have also completed and the **N**+1 kernel can be launched with no downtime. As additional kernels complete, corresponding host-side operations are launched on the host, in a parallel fashion, using multiple threads. Although the host operations take longer than **K**, if **N** is chosen correctly, they will complete with a period of **K**, which is required to ensure we can launch a new kernel every **K**. To reiterate, this scheme requires multi-threaded host-operations because the host must perform processing for up to **N** kernels in parallel in order to keep up. + +The above formula can be used to calculate the **N** required to minimize downtime. However, there are some practical limits: +* **N** sets of buffers are required on both the host and device, therefore both must have the capacity for this many buffers. 
+* If the input and output processing operations are launched in separate threads, then (**N**-1)*2 cores are required, so **C** can become the limiting factor. + +### Measuring the Impact of N-Way Buffering + +You must get a sense of the kernel downtime to identify the degree to which this technique can help improve performance. + +This can be done by querying total kernel execution time from the runtime and comparing it with the overall application execution time. In an application where kernels execute with minimal downtime, these two numbers are close. However, if kernels have a lot of downtime, overall execution time notably exceeds the kernel execution time. The tutorial code exemplifies how to do this. + +### Tutorial Implementation Notes + +The example code runs with multiple iterations to illustrate how performance improves as **N** increases and as multi-threading is used. + +It is useful to think of the execution space as having **N** slots where the slots execute in chronological order, and each slot has its own set of buffers on the host and device. At the beginning of execution, the host prepares the kernel input data for the **N** slots and launches **N** kernels. When slot-0 completes, slot-1 begins executing immediately because it was already queued. The host begins both the output and input processing for slot-0. These two operations must complete before the host can queue another kernel into slot-0. The same is true for all slots. + +After each kernel is launched, the host-side operations (that occur *after* the kernel in that slot completes) are launched immediately from the `main()` program. They block until the kernel execution for that slot completes (this is enforced by the runtime). 
+ + +## Key Concepts +* The N-way buffering optimization technique as a generalization of double buffering +* Determining when N-way buffering is practical and beneficial +* How to measure the impact of N-way buffering + +## License +This code sample is licensed under MIT license. + + +## Building the `n_way_buffering` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. 
+ +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `n_way_buffering_report.prj/reports/` or `n_way_buffering_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Note that because the optimization described in this tutorial takes place at the *runtime* level, the FPGA compiler report will not show a difference between the optimized and unoptimized cases. + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./n_way_buffering.fpga_emu (Linux) + n_way_buffering.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./n_way_buffering.fpga (Linux) + ``` + +### Example of Output + +``` +Platform name: Intel(R) FPGA SDK for OpenCL(TM) +Device name: pac_a10 : Intel PAC Platform (pac_ec00000) + + +Executing kernel 100 times in each round. 
+ +*** Beginning execution, 1-way buffering, single-threaded host operations +Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time = 65915 ms +Total kernel-only execution time = 17852 ms +Throughput = 15.907802 MB/s + + +*** Beginning execution, 1-way buffering, multi-threaded host operations. +Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time = 51814 ms +Total kernel-only execution time = 17852 ms +Throughput = 20.237082 MB/s + + +*** Beginning execution, 2-way buffering, multi-threaded host operationss +Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time = 26109 ms +Total kernel-only execution time = 17852 ms +Throughput = 40.160442 MB/s + + +*** Beginning execution, N=5-way buffering, multi-threaded host operations +Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time with N-way buffering = 18763 ms +Total kernel-only execution time with N-way buffering = 17851 ms +Throughput = 55.884682 MB/s + + +Verification PASSED +``` + +### Discussion of Results + +A test compile of this tutorial design achieved an fMAX of approximately 340 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. 
The results are shown in the following table: + +Configuration | Overall Execution Time (ms) | Total Kernel Execution time (ms) +-|-|- +1-way buffering, single-threaded | 64401 | 15187 +1-way buffering, multi-threaded | 53540 | 15187 +2-way buffering, multi-threaded | 27281 | 15187 +5-way buffering, multi-threaded | 16284 | 15188 + +In all runs, the total kernel execution time is similar, as expected. In the first three configurations, the overall execution time notably exceeds the total kernel execution time, implying there is downtime between kernel executions. However, as we switch from single-threaded to multi-threaded host operations and increase the number of buffer sets used, the overall execution time approaches the kernel execution time. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png new file mode 100755 index 0000000000..2a306929bc Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln new file mode 100755 index 0000000000..5a77b3049a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "n_way_buffering", "n_way_buffering.vcxproj", "{49E7063B-56DA-4ACF-B153-5B56A98645BE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + 
{49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.ActiveCfg = Debug|x64 + {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.Build.0 = Debug|x64 + {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.ActiveCfg = Release|x64 + {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {CC320E26-0D79-434A-8E69-3F09BFB2FCF4} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj new file mode 100755 index 0000000000..dff6f99529 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {49e7063b-56da-4acf-b153-5b56a98645be} + Win32Proj + n_way_buffering + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)n_way_buffering.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + 
true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)n_way_buffering.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json new file mode 100755 index 0000000000..dffbded768 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "2100C9BD-331C-475B-9878-4D14AAF0981D", + "name": "Overlapping Kernel Execution with Buffer Transfers and Host-Processing through N-Way Buffering", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial design to demonstrate overlapping kernel execution with buffer transfers and multi-threaded host-processing to improve system performance", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./n_way_buffering.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "n_way_buffering.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt new file mode 100755 index 0000000000..cf12b30f72 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt @@ -0,0 +1,93 @@ +set(SOURCE_FILE 
n_way_buffering.cpp) +set(TARGET_NAME n_way_buffering) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS " -lpthread -fintelfpga") + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() +set(FPGA_OBJ_FILE "dev_fpga.o") + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + + add_custom_command(OUTPUT ${FPGA_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} -fintelfpga -c ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${FPGA_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${FPGA_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} ${FPGA_OBJ_FILE} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} -lpthread + DEPENDS ${FPGA_OBJ_FILE}) +endif() + + +# report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT 
${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja new file mode 100755 index 0000000000..80284aff9b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja @@ -0,0 +1,30 @@ +source_file = n_way_buffering.cpp +target_name = n_way_buffering + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build 
${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp new file mode 100755 index 0000000000..c5428348db --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp @@ -0,0 +1,437 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +// N-way buffering. N must be >= 1. +constexpr int kLocalN = 5; + +// # times to execute the kernel. kTimes must be >= kLocalN +#if defined(FPGA_EMULATOR) +constexpr int kTimes = 20; +#else +constexpr int kTimes = 100; +#endif + +// # of floats to process on each kernel execution. +#if defined(FPGA_EMULATOR) +constexpr int kSize = 4096; +#else +constexpr int kSize = 2621440; // ~10MB +#endif + +// Kernel executes a power function (base^kPow). Must be +// >= 2. Can increase this to increase kernel execution +// time, but ProcessOutput() time will also increase. +constexpr int kPow = 20; + +// Number of iterations through the main loop +constexpr int kNumRuns = 4; + +bool pass = true; + +class SimpleVpow; + +/* Kernel function. + Performs buffer_b[i] = buffer_a[i] ** pow + Only supports pow >= 2. + This kernel is not meant to be an optimal implementation of the power + operation -- it's just a sample kernel for this tutorial whose execution time + is easily controlled via the pow parameter. 
SYCL buffers are created + externally and passed in by reference to control (external to this function) + when the buffers are destructed. The destructor causes a blocking buffer + transfer from device to host and N-way buffering requires us to not block + here (because we need to queue more kernels). So we only want this transfer + to occur at the end of overall execution, not at the end of each individual + kernel execution. +*/ +void SimplePow(std::unique_ptr &q, buffer &buffer_a, + buffer &buffer_b, event &e) { + // Submit to the queue and execute the kernel + e = q->submit([&](handler &h) { + // Get kernel access to the buffers + auto accessor_a = buffer_a.get_access(h); + auto accessor_b = buffer_b.get_access(h); + + const int num = kSize; + const int p = kPow - 1; // Assumes pow >= 2; + assert(kPow >= 2); + + h.single_task([=]() [[intel::kernel_args_restrict]] { + for (int j = 0; j < p; j++) { + if (j == 0) { + for (int i = 0; i < num; i++) { + accessor_b[i] = accessor_a[i] * accessor_a[i]; + } + } else { + for (int i = 0; i < num; i++) { + accessor_b[i] = accessor_b[i] * accessor_a[i]; + } + } + } + }); + }); + + event update_host_event; + update_host_event = q->submit([&](handler &h) { + auto accessor_b = buffer_b.get_access(h); + + /* + Explicitly instruct the SYCL runtime to copy the kernel's output buffer + back to the host upon kernel completion. This is not required for + functionality since the buffer access in ProcessOutput() also implicitly + instructs the runtime to copy the data back. But it should be noted that + this buffer access blocks ProcessOutput() until the kernel is complete + and the data is copied. In contrast, update_host() instructs the runtime + to perform the copy earlier. This allows ProcessOutput() to optionally + perform more useful work *before* making the blocking buffer access. Said + another way, this allows ProcessOutput() to potentially perform more work + in parallel with the runtime's copy operation. 
+ */ + h.update_host(accessor_b); + }); + +} + +// Returns kernel execution time for a given SYCL event from a queue. +ulong SyclGetExecTimeNs(event e) { + ulong start_time = + e.get_profiling_info(); + ulong end_time = + e.get_profiling_info(); + return (end_time - start_time); +} + +// Local pow function for verifying results +float MyPow(float input, int pow) { + return (pow == 0) ? 1 : input * MyPow(input, pow - 1); +} + +/* Compares kernel output against expected output. + Grabs kernel output data from its SYCL buffer. Reading from this buffer is a + blocking operation that will block on the kernel completing. Grabs expected + output from a host-side copy of the input data. A copy is used to allow for + parallel generation of the input data for the next execution. Queries and + records execution time of the kernel that just completed. This is a natural + place to do this because ProcessOutput() is blocked on kernel completion. +*/ +void ProcessOutput(buffer &output_buf, + std::vector &input_copy, int exec_number, event e, + ulong &total_kernel_time_per_slot) { + auto output_buf_acc = output_buf.get_access(); + int num_errors = 0; + int num_errors_to_print = 10; + + /* The use of update_host() in the kernel function allows for additional + host-side operations to be performed here, in parallel with the buffer copy + operation from device to host, before the blocking access to the output + buffer is made via output_buf_acc[]. To be clear, no real operations are + done here and this is just a note that this is the place + where you *could* do it. */ + for (int i = 0; i < kSize; i++) { + bool out_valid = (MyPow(input_copy.data()[i], kPow) != output_buf_acc[i]); + if ((num_errors < num_errors_to_print) && out_valid) { + if (num_errors == 0) { + pass = false; + std::cout << "Verification failed on kernel execution # " << exec_number + << ". 
Showing up to " << num_errors_to_print + << " mismatches.\n"; + } + std::cout << "Verification failed on kernel execution # " << exec_number + << ", at element " << i << ". Expected " << std::fixed + << std::setprecision(16) << MyPow(input_copy.data()[i], kPow) + << " but got " << output_buf_acc[i] << "\n"; + num_errors++; + } + } + + // At this point we know the kernel has completed, so can query the profiling + // data. + total_kernel_time_per_slot += SyclGetExecTimeNs(e); +} + +/* + Generates input data for the next kernel execution. + Writes the data into the associated SYCL buffer. The write will block until + the previous kernel execution, that is using this buffer, completes. Writes a + copy of the data into a host-side buffer that will later be used by + ProcessOutput(). +*/ +void ProcessInput(buffer &buf, std::vector ©) { + // We are generating completely new input data, so can use discard_write() + // here to indicate we don't care about the SYCL buffer's current contents. + auto buf_acc = buf.get_access(); + + // RNG seed + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + + // RNG engine + std::default_random_engine dre(seed); + + // Values between 1 and 2 + std::uniform_real_distribution di(1.0f, 2.0f); + + // Randomly generate a start value and increment from there. + // Compared to randomly generating every value, this is done to + // speed up this function a bit. + float start_val = di(dre); + + for (int i = 0; i < kSize; i++) { + buf_acc[i] = start_val; + copy.data()[i] = start_val; + start_val++; + } +} + +int main() { +// Create queue, get platform and device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; + std::cout << "\nEmulator output does not demonstrate true hardware " + "performance. 
The design may need to run on actual hardware " + "to observe the performance benefit of the optimization " + "exemplified in this tutorial.\n\n"; +#else + intel::fpga_selector device_selector; +#endif + + try { + auto prop_list = + property_list{property::queue::enable_profiling()}; + + std::unique_ptr q; + q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list)); + + platform platform = q->get_context().get_platform(); + device device = q->get_device(); + std::cout << "Platform name: " + << platform.get_info().c_str() << "\n"; + std::cout << "Device name: " + << device.get_info().c_str() << "\n\n\n"; + + std::cout << "Executing kernel " << kTimes << " times in each round.\n\n"; + + // Create a vector to store the input/output SYCL buffers + std::vector> input_buf; + std::vector> output_buf; + + // For every execution slot, we need 2 host-side buffers + // to store copies of the input data. One is used to + // verify the previous kernel's output. The other stores + // the new data for the next kernel execution. + std::vector input_buf_copy[2 * kLocalN]; + + // SYCL events for each kernel launch. + event sycl_events[kLocalN]; + + // In nanoseconds. Total execution time of kernels in a given slot. + ulong total_kernel_time_per_slot[kLocalN]; + + // Total execution time of all kernels. + ulong total_kernel_time = 0; + + // Threads to process the output from each kernel + std::thread t_process_output[kLocalN]; + + // Threads to process the input data for the next kernel + std::thread t_process_input[kLocalN]; + + // Demonstrate with 1-way buffering first, then N-way buffering. + int N; + + // st = "single threaded". + // Used to enable multi-threading in subsequent runs. 
+ bool st = true; + + // Allocate vectors to store the host-side copies of the input data + for (int i = 0; i < 2 * kLocalN; i++) { + input_buf_copy[i] = std::vector(kSize); + } + + // Create and allocate the SYCL buffers + for (int i = 0; i < kLocalN; i++) { + input_buf.push_back(buffer(range<1>(kSize))); + output_buf.push_back(buffer(range<1>(kSize))); + } + + /* + Main loop. + This loop runs multiple times to demonstrate how performance can be + improved by increasing the number of buffers as well as multi-threading + the host-side operations. The first iteration is a base run, demonstrating + the performance with none of these optimizations (ie. 1-way buffering, + single-threaded). + */ + for (int i = 0; i < kNumRuns; i++) { + for (int i = 0; i < kLocalN; i++) { + total_kernel_time_per_slot[i] = 0; // Initialize timers to zero. + } + + switch (i) { + case 0: { + std::cout << "*** Beginning execution, 1-way buffering, " + "single-threaded host operations\n"; + N = 1; + st = true; + break; + } + case 1: { + std::cout << "*** Beginning execution, 1-way buffering, " + "multi-threaded host operations.\n"; + N = 1; + st = false; + break; + } + case 2: { + std::cout << "*** Beginning execution, 2-way buffering, " + "multi-threaded host operationss\n"; + N = 2; + st = false; + break; + } + case 3: { + std::cout << "*** Beginning execution, N=" << kLocalN + << "-way buffering, multi-threaded host operations\n"; + N = kLocalN; + st = false; + break; + } + default: + std::cout << "*** Beginning execution.\n"; + } + + // Start the timer. This will include the time to process the + // input data for the first N kernel executions. + dpc_common::TimeInterval exec_time; + + // Process the input data for first N kernel executions. For + // multi-threaded runs, this is done in parallel. 
+ for (int i = 0; i < N; i++) { + t_process_input[i] = std::thread(ProcessInput, std::ref(input_buf[i]), + std::ref(input_buf_copy[i])); + if (st) { + t_process_input[i].join(); + } + } + + /* + It's useful to think of the kernel execution space as having N slots. + Conceptually, the slots are executed chronologically sequentially on the + device (i.e. slot 0 to N-1). Each slot has its own buffering on both the + host and device. Before launching a kernel in a given slot, we must + process output data from the previous execution that occurred in that + slot and process new input data for the upcoming new execution in that + slot. + */ + for (int i = 0; i < kTimes; i++) { + // The current slot is i%N. + // Before each kernel launch, the ProcessOutput() must have completed + // for the last execution in this slot. The ProcessInput() must also + // have completed for the upcoming new execution for this slot. Block on + // both of these. + if (!st) { + // ProcessOutput() is only relevant after the + // first N kernels have been launched. + if (i >= N) { + t_process_output[i % N].join(); + } + + t_process_input[i % N].join(); + } + + // Launch the kernel. This is non-blocking with respect to main(). + // Only print every few iterations, just to limit the prints. + if (i % 10 == 0) { + std::cout << "Launching kernel #" << i << "\n"; + } + + SimplePow(q, input_buf[i % N], output_buf[i % N], sycl_events[i % N]); + + // Immediately launch threads for the ProcessOutput() and + // ProcessInput() for *this* slot. These are non-blocking with respect + // to main(), but they will individually be blocked until the + // corresponding kernel execution is complete. The ProcessOutput() + // compares the kernel output data against the input data. But + // ProcessInput() will be overwriting that input data in parallel. + // Therefore ProcessOutput() must compare against an older copy of the + // data. We ping-pong between host-side copies of the input data. 
+ t_process_output[i % N] = std::thread( + ProcessOutput, std::ref(output_buf[i % N]), + std::ref(input_buf_copy[i % (2 * N)]), i, sycl_events[i % N], + std::ref(total_kernel_time_per_slot[i % N])); + + // For single-threaded runs, force single-threaded operation by + // blocking here immediately. + if (st) { + t_process_output[i % N].join(); + } + + // For the final N kernel launches, no need to process + // input data because there will be no more launches. + if (i < kTimes - N) { + // The indexes for the input_buf_copy used by ProcessOutput() and + // ProcessInput() are spaced N apart. + t_process_input[i % N] = + std::thread(ProcessInput, std::ref(input_buf[i % N]), + std::ref(input_buf_copy[(i + N) % (2 * N)])); + + if (st) { + t_process_input[i % N].join(); + } + } + } + + // Wait for the final N threads to finish and add up the overall kernel + // execution time. + total_kernel_time = 0; + for (int i = 0; i < N; i++) { + if (!st) { + t_process_output[i].join(); + } + total_kernel_time += total_kernel_time_per_slot[i]; + } + + // Stop the timer. + double time_span = exec_time.Elapsed(); + + std::cout << "\nOverall execution time " + << ((i == kNumRuns - 1) ? ("with N-way buffering ") : "") + << "= " << (unsigned)(time_span * 1000) << " ms\n"; + std::cout << "Total kernel-only execution time " + << ((i == kNumRuns - 1) ? ("with N-way buffering ") : "") + << "= " << (unsigned)(total_kernel_time / 1000000) << " ms\n"; + std::cout << "Throughput = " << std::setprecision(8) + << (float)kSize * (float)kTimes * (float)sizeof(float) / + (float)time_span / 1000000 + << " MB/s\n\n\n"; + } + if (pass) { + std::cout << "Verification PASSED\n"; + } else { + std::cout << "Verification FAILED\n"; + return 1; + } + } catch (sycl::exception const& e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt new file mode 100755 index 0000000000..4835f73b5f --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(LocalMemoryCache) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md new file mode 100755 index 0000000000..8a974787e4 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md @@ -0,0 +1,189 @@ +# Caching On-Chip Memory to Improve Loop Performance +This FPGA tutorial demonstrates how to build a simple cache (implemented in FPGA registers) to store recently-accessed memory locations so that the compiler can achieve II=1 on critical loops in task kernels. + + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How and when to implement the on-chip memory cache optimization +| Time to complete | 30 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +In DPC++ task kernels for FPGA, it is always our objective to achieve an initiation interval (II) of 1 on performance-critical loops. This means that a new loop iteration is launched on every clock cycle, maximizing the throughput of the loop. + +When the loop contains a loop-carried variable that is implemented in on-chip memory, the compiler often *cannot* achieve II=1 because the memory access takes more than one clock cycle. If the updated memory location may be needed on the next loop iteration, the next iteration must be delayed to allow time for the update, hence II > 1. + +The on-chip memory cache technique breaks this dependency by storing recently-accessed values in a cache capable of a 1-cycle read-modify-write operation. The cache is implemented in FPGA registers rather than on-chip memory. By pulling memory accesses preferentially from the register cache, the loop-carried dependency is broken. + +### When is the on-chip memory cache technique applicable? + +***Failure to achieve II=1 because of a loop-carried memory dependency in on-chip memory***: +The on-chip memory cache technique is applicable if the compiler could not pipeline a loop with II=1 because of an on-chip memory dependency. (If the compiler could not achieve II=1 because of a *global* memory dependency, this technique does not apply as the access latencies are too great.) + +To check this for a given design, view the "Loops Analysis" section of its optimization report. The report lists the II of all loops and explains why a lower II is not achievable. Check whether the reason given resembles "the compiler failed to schedule this loop with smaller II due to memory dependency". 
The report will describe the "most critical loop feedback path during scheduling". Check whether this includes on-chip memory load/store operations on the critical path. + +***An II=1 loop with a load operation of latency 1***: +The compiler is capable of reducing the latency of on-chip memory accesses in order to achieve II=1. However, in doing so the compiler makes a trade-off, sacrificing fMAX to better optimize the loop. + +In a design with II=1 critical loops but lower than desired fMAX, the on-chip memory cache technique may still be applicable. It can help recover fMAX by enabling the compiler to achieve II=1 with a higher latency memory access. + +To check whether this is the case for a given design, view the "Kernel Memory Viewer" section of the optimization report. Select the on-chip memory of interest from the Kernel Memory List, and mouse over the load operation "LD" to check its latency. If the latency of the load operation is 1, this is a clear sign that the compiler has attempted to sacrifice fMAX to better optimize a loop. + + +### Implementing the on-chip memory cache technique + +The tutorial demonstrates the technique using a program that computes a histogram. The histogram operation accepts an input vector of values, separates the values into buckets, and counts the number of values per bucket. For each input value, an output bucket location is determined, and the count for the bucket is incremented. This count is stored in the on-chip memory and the increment operation requires reading from the memory, performing the increment, and storing the result. This read-modify-write operation is the critical path that can result in II > 1. + +To reduce II, the idea is to store recently-accessed values in an FPGA register-implemented cache that is capable of a 1-cycle read-modify-write operation. If the memory location required on a given iteration exists in the cache, it is pulled from there. 
The updated count is written back to *both* the cache and the on-chip memory. The `ivdep` pragma is added to inform the compiler that if a loop-carried variable (namely, the variable storing the histogram output) is needed within `CACHE_DEPTH` iterations, it is guaranteed to be available right away. + +### Selecting the cache depth + +While any value of `CACHE_DEPTH` results in functional hardware, the ideal value of `CACHE_DEPTH` requires some experimentation. The depth of the cache needs to roughly cover the latency of the on-chip memory access. To determine the correct value, it is suggested to start with a value of 2 and then increase it until both II = 1 and load latency > 1. In this tutorial, a `CACHE_DEPTH` of 5 is needed. + +Each iteration takes only a few moments by running `make report` (refer to the section below on how to build the design). It is important to find the *minimal* value of `CACHE_DEPTH` that results in a maximal performance increase. Unnecessarily large values of `CACHE_DEPTH` consume unnecessary FPGA resources and can reduce fMAX. Therefore, at a `CACHE_DEPTH` that results in II=1 and load latency = 1, if further increases to `CACHE_DEPTH` show no improvement, then `CACHE_DEPTH` should not be increased any further. + +In the tutorial, two versions of the histogram kernel are implemented: one with and one without caching. The report shows II > 1 for the loop in the kernel without caching and II = 1 for the one with caching. + +## Key Concepts +* How to implement the on-chip memory cache optimization technique +* The scenarios in which this technique benefits performance +* How to tune the cache depth + +## License +This code sample is licensed under MIT license. + + +## Building the `onchip_memory_cache` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. 
+ +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + + +## Examining the Reports +Locate `report.html` in the `onchip_memory_cache_report.prj/reports/` or `onchip_memory_cache_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Compare the Loop Analysis reports with and without the onchip memory cache optimization, as described in the "When is the on-chip memory cache technique applicable?" section. + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./onchip_memory_cache.fpga_emu (Linux) + onchip_memory_cache.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./onchip_memory_cache.fpga (Linux) + ``` + +### Example of Output + +``` +Platform name: Intel(R) FPGA SDK for OpenCL(TM) +Device name: pac_a10 : Intel PAC Platform (pac_ee00000) + + +Number of inputs: 16777216 +Number of outputs: 64 + +Beginning run without local memory caching. + +Verification PASSED + +Kernel execution time: 0.114106 seconds +Kernel throughput without caching: 560.884047 MB/s + +Beginning run with local memory caching. 
+ +Verification PASSED + +Kernel execution time: 0.059061 seconds +Kernel throughput with caching: 1083.623184 MB/s +``` + +### Discussion of Results + +A test compile of this tutorial design achieved an fMAX of approximately 250 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results are shown in the following table: + +Configuration | Execution Time (ms) | Throughput (MB/s) +-|-|- +Without caching | 0.153 | 418 +With caching | 0.08 | 809 + +When caching is used, performance notably increases. As previously mentioned, this technique should result in an II reduction, which should lead to a throughput improvement. The technique can also improve fMAX if the compiler had previously implemented a latency=1 load operation, in which case the fMAX increase should result in a further throughput improvement. + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln new file mode 100755 index 0000000000..3df819f016 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "onchip_memory_cache", "onchip_memory_cache.vcxproj", "{66A01391-21D2-46BB-A37A-6B8670BEE1FC}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Debug|x64.ActiveCfg = Debug|x64 + {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Debug|x64.Build.0 = Debug|x64 + {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Release|x64.ActiveCfg = Release|x64 + 
{66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {E3206292-E99D-4ADC-B428-E0557E8070D4} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj new file mode 100755 index 0000000000..940683894e --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {66a01391-21d2-46bb-a37a-6b8670bee1fc} + Win32Proj + onchip_memory_cache + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)onchip_memory_cache.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)onchip_memory_cache.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + 
+ \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json new file mode 100755 index 0000000000..a35ba679ac --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "93DA332C-5490-4E4B-8038-BDEC1662A2D0", + "name": "Caching On-Chip Memory to Improve Loop Performance", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating the caching of on-chip memory to reduce loop initiation interval.", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "builder": ["ide", "cmake"], + "targetDevice": ["FPGA"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./onchip_memory_cache.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "onchip_memory_cache.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt new file mode 100755 index 0000000000..9ed3cee584 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt @@ -0,0 +1,89 @@ +set(SOURCE_FILE onchip_memory_cache.cpp) +set(TARGET_NAME onchip_memory_cache) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME 
"intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link 
${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja new file mode 100755 index 0000000000..94d90e092c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja @@ -0,0 +1,30 @@ +source_file = onchip_memory_cache.cpp +target_name = onchip_memory_cache + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony 
${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp new file mode 100755 index 0000000000..83b48eac97 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp @@ -0,0 +1,235 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +constexpr int kInitNumInputs = 16 * 1024 * 1024; // Default number of inputs. +constexpr int kNumOutputs = 64; // Number of outputs +constexpr int kInitSeed = 42; // Seed for randomizing data inputs +constexpr int kCacheDepth = 5; // Depth of the cache. +constexpr int kNumRuns = 2; // runs twice to show the impact of cache +constexpr double kNs = 1000000000.0; // number of nanoseconds in a second + +template +class Task; + +// This kernel function implements two data paths: with and without caching. +// use_cache specifies which path to take. 
+template +void Histogram(std::unique_ptr& q, buffer& input_buf, + buffer& output_buf, event& e) { + // Enqueue kernel + e = q->submit([&](handler& h) { + // Get accessors to the SYCL buffers + auto input = input_buf.get_access(h); + auto output = output_buf.get_access(h); + + h.single_task>([=]() [[intel::kernel_args_restrict]] { + + // On-chip memory for Histogram + uint32_t local_output[kNumOutputs]; + uint32_t local_output_with_cache[kNumOutputs]; + + // Register-based cache of recently-accessed memory locations + uint32_t last_sum[kCacheDepth + 1]; + uint32_t last_sum_index[kCacheDepth + 1]; + + // Initialize Histogram to zero + for (uint32_t b = 0; b < kNumOutputs; ++b) { + local_output[b] = 0; + local_output_with_cache[b] = 0; + } + + // Compute the Histogram + if (!use_cache) { // Without cache + for (uint32_t n = 0; n < kInitNumInputs; ++n) { + // Compute the Histogram index to increment + uint32_t b = input[n] % kNumOutputs; + local_output[b]++; + } + } else { // With cache + + // Specify that the minimum dependence-distance of + // loop carried variables is kCacheDepth. + [[intelfpga::ivdep(kCacheDepth)]] for (uint32_t n = 0; + n < kInitNumInputs; ++n) { + // Compute the Histogram index to increment + uint32_t b = input[n] % kNumOutputs; + + // Get the value from the on-chip mem at this index. + uint32_t val = local_output_with_cache[b]; + + // However, if this location in on-chip mem was recently + // written to, take the value from the cache. + #pragma unroll + for (int i = 0; i < kCacheDepth + 1; i++) { + if (last_sum_index[i] == b) val = last_sum[i]; + } + + // Write the new value to both the cache and the on-chip mem. + last_sum[kCacheDepth] = local_output_with_cache[b] = val + 1; + last_sum_index[kCacheDepth] = b; + + // Cache is just a shift register, so shift the shift reg. Pushing + // into the back of the shift reg is done above. 
+ #pragma unroll + for (int i = 0; i < kCacheDepth; i++) { + last_sum[i] = last_sum[i + 1]; + last_sum_index[i] = last_sum_index[i + 1]; + } + } + } + + // Write output to global memory + for (uint32_t b = 0; b < kNumOutputs; ++b) { + if (!use_cache) { + output[b] = local_output[b]; + } else { + output[b] = local_output_with_cache[b]; + } + } + }); + }); +} + +int main() { + // Host and kernel profiling + event e; + ulong t1_kernel, t2_kernel; + double time_kernel; + +// Create queue, get platform and device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; + std::cout << "\nEmulator output does not demonstrate true hardware " + "performance. The design may need to run on actual hardware " + "to observe the performance benefit of the optimization " + "exemplified in this tutorial.\n\n"; +#else + intel::fpga_selector device_selector; +#endif + try { + auto prop_list = + property_list{property::queue::enable_profiling()}; + + std::unique_ptr q; + q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list)); + + platform platform = q->get_context().get_platform(); + device device = q->get_device(); + std::cout << "Platform name: " + << platform.get_info().c_str() << "\n"; + std::cout << "Device name: " + << device.get_info().c_str() << "\n\n\n"; + + std::cout << "\nNumber of inputs: " << kInitNumInputs << "\n"; + std::cout << "Number of outputs: " << kNumOutputs << "\n\n"; + + // Create input and output buffers + auto input_buf = buffer(range<1>(kInitNumInputs)); + auto output_buf = buffer(range<1>(kNumOutputs)); + + srand(kInitSeed); + + // Compute the reference solution + uint32_t gold[kNumOutputs]; + + { + // Get host-side accessors to the SYCL buffers + auto input_host = input_buf.get_access(); + // Initialize random input + for (int i = 0; i < kInitNumInputs; ++i) { + input_host[i] = rand(); + } + + for (int b = 0; b < kNumOutputs; ++b) { + gold[b] = 0; + } + for (int i = 0; i < kInitNumInputs; ++i) { + int b = 
input_host[i] % kNumOutputs; + gold[b]++; + } + } + + // Host accessor is now out-of-scope and is destructed. This is required + // in order to unblock the kernel's subsequent accessor to the same buffer. + + for (int i = 0; i < kNumRuns; i++) { + switch (i) { + case 0: { + std::cout << "Beginning run without on-chip memory caching.\n\n"; + Histogram(q, input_buf, output_buf, e); + break; + } + case 1: { + std::cout << "Beginning run with on-chip memory caching.\n\n"; + Histogram(q, input_buf, output_buf, e); + break; + } + default: { + Histogram(q, input_buf, output_buf, e); + } + } + + // Wait for kernels to finish + q->wait(); + + // Compute kernel execution time + t1_kernel = e.get_profiling_info(); + t2_kernel = e.get_profiling_info(); + time_kernel = (t2_kernel - t1_kernel) / kNs; + + // Get accessor to output buffer. Accessing the buffer at this point in + // the code will block on kernel completion. + auto output_host = output_buf.get_access(); + + // Verify output and print pass/fail + bool passed = true; + int num_errors = 0; + for (int b = 0; b < kNumOutputs; b++) { + if (num_errors < 10 && output_host[b] != gold[b]) { + passed = false; + std::cout << " (mismatch, expected " << gold[b] << ")\n"; + num_errors++; + } + } + + if (passed) { + std::cout << "Verification PASSED\n\n"; + + // Report host execution time and throughput + std::cout.setf(std::ios::fixed); + double N_MB = (kInitNumInputs * sizeof(uint32_t)) / + (1024 * 1024); // Input size in MB + + // Report kernel execution time and throughput + std::cout << "Kernel execution time: " << time_kernel << " seconds\n"; + std::cout << "Kernel throughput " << (i == 0 ? 
"without" : "with") + << " caching: " << N_MB / time_kernel << " MB/s\n\n"; + } else { + std::cout << "Verification FAILED\n"; + return 1; + } + } + } catch (sycl::exception const& e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/CMakeLists.txt new file mode 100755 index 0000000000..09e703741b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(PipeArray) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 
permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/README.md new file mode 100755 index 0000000000..d292d6465f --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/README.md @@ -0,0 +1,215 @@ + +# Data Transfers Using Pipe Arrays +This FPGA tutorial showcases a design pattern that makes it possible to create arrays of pipes. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | A design pattern to generate an array of pipes in DPC++
Static loop unrolling through template metaprogramming +| Time to complete | 15 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +In certain situations, it is useful to be able to create collection of pipes that can be indexed like an array in a DPC++ FPGA design. If you are not yet familiar with DPC++ pipes, refer to the prerequisite tutorial "Data Transfers Using Pipes". + +In SYCL*, each pipe defines a unique type with static methods for reading data (`read`) and writing data (`write`). Since pipes are not objects but *types*, defining a collection of pipes requires C++ template meta-programming. This is somewhat non-intuitive but yields highly efficient code. + +This tutorial provides a convenient pair of header files defining an abstraction for an array of pipes. The headers can be used in any DPC++ design and can be extended as necessary. + +### Example 1: A simple array of pipes + +To create an array of pipes, include the top-level header (from this code sample) in your design: + +```c++ +#include "pipe_array.hpp" +``` + +As with regular pipes, an array of pipes needs template parameters for an ID, for the `min_capacity` of each pipe, and for the data type of each pipe. An array of pipes additionally requires one or more template parameters to specify the array size. The following code declares a one dimensional array of 10 pipes, each with `capacity=32`, that operate on `int` values. + +```c++ +using MyPipeArray = PipeArray< // Defined in "pipe_array.h". + class MyPipe, // An identifier for the pipe. + int, // The type of data in the pipe. + 32, // The capacity of each pipe. + 10, // array dimension. + >; +``` + +The uniqueness of a pipe array is derived from a combination of all template parameters. 
+ +Indexing inside a pipe array can be done via the `PipeArray::PipeAt` type alias, as shown in the following code snippet: + +```c++ +MyPipeArray::PipeAt<3>::write(17); +auto x = MyPipeArray::PipeAt<3>::read(); +``` +The template parameter `<3>` identifies a specific pipe within the array of pipes. The index of the pipe being accessed *must* be determinable at compile time. + +In most cases, we want to use an array of pipes so that we can iterate over them in a loop. In order to respect the requirement that all pipe indices are uniquely determinable at compile time, we must use a static form of loop unrolling based on C++ templates. A simple example is shown in the code snippet: + +```c++ +// Write 17 to every pipe in the array +Unroller<0, 10>::Step([](auto i) { + MyPipeArray::PipeAt::write(17); +}); +``` +While this may initially feel foreign to those unaccustomed to C++ template metaprogramming, this is a simple and powerful pattern common to many C++ libraries. It is easy to reuse. In addition to `pipe_array.hpp`, this code sample includes a simple header file `unroller.hpp`, which implements the `Unroller` functionality. + +### Example 2: A 2D array of pipes + +This code sample defines a `Producer` kernel that reads data from host memory and forwards this data into a two dimensional pipe matrix. + +The following code snippet creates a two dimensional pipe array. +``` c++ +constexpr size_t kNumRows = 2; +constexpr size_t kNumCols = 2; +constexpr size_t kDepth = 2; + +using ProducerToConsumerPipeMatrix = PipeArray< // Defined in "pipe_array.h". + class ProducerConsumerPipe, // An identifier for the pipe. + uint64_t, // The type of data in the pipe. + kDepth, // The capacity of each pipe. + kNumRows, // array dimension. + kNumCols // array dimension. + >; +``` +The producer kernel writes `num_passes` units of data into each of the `kNumRows * kNumCols` pipes. Note that the unrollers' lambdas must capture certain variables from their outer scope. 
+ +```c++ +h.single_task([=]() { + size_t input_idx = 0; + for (size_t pass = 0; pass < num_passes; pass++) { + // Template-based unroll (outer "i" loop) + Unroller<0, kNumRows>::Step([&input_idx, input_accessor](auto i) { + // Template-based unroll (inner "j" loop) + Unroller<0, kNumCols>::Step([&input_idx, i, input_accessor](auto j) { + // Write a value to the pipe of the pipe array + ProducerToConsumerPipeMatrix::PipeAt::write( + input_accessor[input_idx++]); + }); + }); + } +}); +``` + +The code sample also defines an array of `Consumer` kernels that each read from a unique pipe in `ProducerToConsumerPipeMatrix`, process the data, and write the result to the host memory. + +```c++ +// The consumer kernel reads from a single pipe, determined by consumer_id +h.single_task>([=]() { + constexpr size_t x = consumer_id / kNumCols; + constexpr size_t y = consumer_id % kNumCols; + for (size_t i = 0; i < num_elements; ++i) { + auto input = ProducerToConsumerPipeMatrix::PipeAt::read(); + uint64_t answer = ConsumerWork(input); // do some processing + output_accessor[i] = answer; + } +}); +``` + +The host must thus enqueue the producer kernel and `kNumRows * kNumCols` separate consumer kernels. The latter is achieved through another static unroll. +```c++ +{ + queue q(device_selector, dpc_common::exception_handler); + + // Enqueue producer + buffer producer_buffer(producer_input); + Producer(q, producer_buffer); + + // Use template-based unroll to enqueue multiple consumers + std::vector> consumer_buffers; + Unroller<0, kNumberOfConsumers>::Step([&](auto consumer_id) { + consumer_buffers.emplace_back(consumer_output[consumer_id].data(), items_per_consumer); + Consumer(q, consumer_buffers.back()); + }); +} +``` + +## Key Concepts +* A design pattern to generate a array of pipes in DPC++ +* Static loop unrolling through template metaprogramming + +## License +This code sample is licensed under MIT license. 
+ + +## Building the `pipe_array` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*). 
For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + + +## Examining the Reports +Locate `report.html` in the `pipe_array_report.prj/reports/` or `pipe_array_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +You can visualize the kernels and pipes generated by looking at the "System Viewer" section of the report. However, it is recommended that you first reduce the array dimensions `kNumRows` and `kNumCols` to small values (2 or 3) to facilitate visualization. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./pipe_array.fpga_emu (Linux) + pipe_array.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./pipe_array.fpga (Linux) + ``` + +### Example of Output +``` +Input Array Size: 1024 +Enqueuing producer... +Enqueuing consumer 0... +Enqueuing consumer 1... +Enqueuing consumer 2... +Enqueuing consumer 3... 
+PASSED: The results are correct +``` diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/pipe_array.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/pipe_array.sln new file mode 100755 index 0000000000..efb4ff761f --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/pipe_array.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pipe_array", "pipe_array.vcxproj", "{FA3FB2D1-BA98-4B4E-A8FA-A9BE6F8CA204}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {FA3FB2D1-BA98-4B4E-A8FA-A9BE6F8CA204}.Debug|x64.ActiveCfg = Debug|x64 + {FA3FB2D1-BA98-4B4E-A8FA-A9BE6F8CA204}.Debug|x64.Build.0 = Debug|x64 + {FA3FB2D1-BA98-4B4E-A8FA-A9BE6F8CA204}.Release|x64.ActiveCfg = Release|x64 + {FA3FB2D1-BA98-4B4E-A8FA-A9BE6F8CA204}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {936BD366-28EA-4A45-B5CF-EE6630694F28} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/pipe_array.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/pipe_array.vcxproj new file mode 100755 index 0000000000..5ebc0c86e4 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/pipe_array.vcxproj @@ -0,0 +1,165 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + + 15.0 + {fa3fb2d1-ba98-4b4e-a8fa-a9be6f8ca204} + Win32Proj + pipe_array + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + 
Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)pipe_array.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)pipe_array.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/sample.json new file mode 100755 index 0000000000..047514cfcc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "11A61AF6-727E-4241-B5A0-CCCD0EF160B9", + "name": "Data Transfers Using Pipe Arrays", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial showcasing a design pattern to enables the creation of arrays of pipes.", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./pipe_array.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir 
build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "pipe_array.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/CMakeLists.txt new file mode 100755 index 0000000000..0301dbed55 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/CMakeLists.txt @@ -0,0 +1,91 @@ +set(SOURCE_FILE pipe_array.cpp) +set(TARGET_NAME pipe_array) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. 
Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) + +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND 
${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/pipe_array.hpp pipe_array.hpp COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/unroller.hpp unroller.hpp COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/pipe_array_internal.hpp pipe_array_internal.hpp COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE} pipe_array.hpp unroller.hpp pipe_array_internal.hpp) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/build.ninja new file mode 100755 index 0000000000..3ea2cc86e1 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/build.ninja @@ -0,0 +1,30 @@ +source_file = pipe_array.cpp +target_name = pipe_array + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware -std=c++14 +emulator_flags = -fintelfpga -DFPGA_EMULATOR -std=c++14 + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule 
gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array.cpp new file mode 100755 index 0000000000..e5bcbbaec1 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array.cpp @@ -0,0 +1,177 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include +#include "dpc_common.hpp" +#include "pipe_array.hpp" +#include "unroller.hpp" + +using namespace sycl; + +constexpr size_t kNumRows = 2; +constexpr size_t kNumCols = 2; +constexpr size_t kNumberOfConsumers = kNumRows * kNumCols; +constexpr size_t kDepth = 2; + +using ProducerToConsumerPipeMatrix = PipeArray< // Defined in "pipe_array.h". + class ProducerConsumerPipe, // An identifier for the pipe. + uint64_t, // The type of data in the pipe. + kDepth, // The capacity of each pipe. + kNumRows, // array dimension. + kNumCols // array dimension. + >; + +// Forward declaration of the kernel name +// (This will become unnecessary in a future compiler version.) 
+class ProducerTutorial; +template class ConsumerTutorial; + +void Producer(queue &q, buffer &input_buffer) { + std::cout << "Enqueuing producer...\n"; + + auto e = q.submit([&](handler &h) { + auto input_accessor = input_buffer.get_access(h); + auto num_elements = input_buffer.get_count(); + auto num_passes = num_elements / kNumberOfConsumers; + + // The producer kernel writes to every pipe in the 2D pipe array + h.single_task([=]() { + size_t input_idx = 0; + for (size_t pass = 0; pass < num_passes; pass++) { + // Template-based unroll (outer "i" loop) + Unroller<0, kNumRows>::Step([&input_idx, input_accessor](auto i) { + // Template-based unroll (inner "j" loop) + Unroller<0, kNumCols>::Step([&input_idx, &i, input_accessor](auto j) { + // Write a value to the pipe of the pipe array + ProducerToConsumerPipeMatrix::PipeAt::write( + input_accessor[input_idx++]); + }); + }); + } + }); + }); +} + +// Do some work on the data (any function could be substituted) +uint64_t ConsumerWork(uint64_t i) { return i * i; } + +template +void Consumer(queue &q, buffer &out_buf) { + std::cout << "Enqueuing consumer " << consumer_id << "...\n"; + + auto e = q.submit([&](handler &h) { + auto output_accessor = out_buf.get_access(h); + auto num_elements = out_buf.get_count(); + + // The consumer kernel reads from a single pipe, determined by consumer_id + h.single_task>([=]() { + constexpr size_t consumer_x = consumer_id / kNumCols; + constexpr size_t consumer_y = consumer_id % kNumCols; + for (size_t i = 0; i < num_elements; ++i) { + auto input = ProducerToConsumerPipeMatrix::PipeAt::read(); + uint64_t answer = ConsumerWork(input); + output_accessor[i] = answer; + } + }); + }); +} + +int main(int argc, char *argv[]) { + uint64_t array_size = 1; + array_size <<= 10; + + // Parse optional data size argument + if (argc > 1) { + std::string option(argv[1]); + if (option == "-h" || option == "--help") { + std::cout << "Usage: \n \n\nFAILED\n"; + return 1; + } else { + array_size = 
std::stoi(option); + } + } + + std::cout << "Input Array Size: " << array_size << "\n"; + + // Check input validity + if (array_size % kNumberOfConsumers != 0) { + std::cout << "Array size must be a multiple of the number of consumers! " + "Exiting...\n"; + return 0; + } + + // Set up producer input vector, and kNumberOfConsumers output vectors + uint64_t items_per_consumer = array_size / kNumberOfConsumers; + std::vector producer_input(array_size, -1); + std::array, kNumberOfConsumers> consumer_output; + + for (auto &output : consumer_output) + output.resize(items_per_consumer, -1); + + // Initialize producer input + for (size_t i = 0; i < array_size; i++) + producer_input[i] = i; + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + queue q(device_selector, dpc_common::exception_handler); + + // Enqueue producer + buffer producer_buffer(producer_input); + Producer(q, producer_buffer); + + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + std::vector> consumer_buffers; + + // Use template-based unroll to enqueue multiple consumers + Unroller<0, kNumberOfConsumers>::Step([&](auto consumer_id) { + consumer_buffers.emplace_back(consumer_output[consumer_id].data(), + items_per_consumer); + Consumer(q, consumer_buffers.back()); + }); + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + // Verify result + for (size_t i = 0; i < items_per_consumer; ++i) { + for (size_t consumer = 0; consumer < kNumberOfConsumers; ++consumer) { + auto fpga_result = consumer_output[consumer][i]; + auto expected_result = ConsumerWork(kNumberOfConsumers * i + consumer); + if (fpga_result != expected_result) { + std::cout << "FAILED: The results are incorrect\n"; + std::cout << "On Input: " << kNumberOfConsumers * i + consumer + << " Expected: " << expected_result << " Got: " << fpga_result + << "\n"; + return 1; + } + } + } + + std::cout << "PASSED: The results are correct\n"; + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array.hpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array.hpp new file mode 100755 index 0000000000..cbcefd36b8 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array.hpp @@ -0,0 +1,33 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include + +#include "pipe_array_internal.hpp" + +template +struct PipeArray { + PipeArray() = delete; + + template + struct StructId; + + template + struct VerifyIndices { + static_assert(sizeof...(idxs) == sizeof...(dims), + "Indexing into a PipeArray requires as many indices as " + "dimensions of the PipeArray."); + static_assert(VerifierDimLayer::template VerifierIdxLayer< + idxs...>::IsValid(), + "Index out of bounds"); + using VerifiedPipe = + cl::sycl::intel::pipe, BaseTy, depth>; + }; + + template + using PipeAt = 
typename VerifyIndices::VerifiedPipe; +}; diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array_internal.hpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array_internal.hpp new file mode 100755 index 0000000000..1b62f667f2 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/pipe_array_internal.hpp @@ -0,0 +1,26 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +namespace { +template +struct VerifierDimLayer { + template + struct VerifierIdxLayer { + static constexpr bool IsValid() { + return idx1 < dim1 && + (VerifierDimLayer::template VerifierIdxLayer< + idxs...>::IsValid()); + } + }; +}; +template +struct VerifierDimLayer { + template + struct VerifierIdxLayer { + static constexpr bool IsValid() { return idx < dim; } + }; +}; +} // namespace diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/unroller.hpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/unroller.hpp new file mode 100755 index 0000000000..4bfb9422bd --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/pipe_array/src/unroller.hpp @@ -0,0 +1,15 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +template struct Unroller { + template static void Step(const Action &action) { + action(std::integral_constant()); + Unroller::Step(action); + } +}; + +template struct Unroller { + template static void Step(const Action &) {} +}; diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/CMakeLists.txt 
b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/CMakeLists.txt new file mode 100755 index 0000000000..367086979c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(RemoveLoopCarriedDependency) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/README.md new file mode 100755 index 0000000000..37e8edeeaf --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/README.md @@ -0,0 +1,176 @@ +# Removing Loop Carried Dependencies +This tutorial demonstrates how to remove a loop-carried dependency to improve the performance of FPGA device code. + +***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | A technique to remove loop carried dependencies from your FPGA device code, and when to apply it +| Time to complete | 25 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +This tutorial demonstrates how to remove a loop-carried dependency in FPGA device code. A snippet of the baseline unoptimized code (the `Unoptimized` function in `src/loop_carried_dependency.cpp`) is given below: + +``` +double sum = 0; +for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < N; j++) { + sum += a[i * N + j]; + } + sum += b[i]; +} +result[0] = sum; +``` + +In the unoptimized kernel, a sum is computed over two loops. The inner loop sums over the `a` data and the outer loop over the `b` data. Since the value `sum` is updated in both loops, this introduces a _loop carried dependency_ that causes the outer loop to be serialized, allowing only one invocation of the outer loop to be active at a time, which reduces performance. + +A snippet of the optimized code (the `Optimized` function in `src/loop_carried_dependency.cpp`) is given below, which removes the loop carried dependency on the `sum` variable: + +``` +double sum = 0; + +for (size_t i = 0; i < N; i++) { + // Step 1: Definition + double sum_2 = 0; + + // Step 2: Accumulation of array A values for one outer loop iteration + for (size_t j = 0; j < N; j++) { + sum_2 += a[i * N + j]; + } + + // Step 3: Addition of array B value for an outer loop iteration + sum += sum_2; + sum += b[i]; +} + +result[0] = sum; +``` + +The optimized kernel demonstrates the use of an independent variable `sum_2` that is not updated in the outer loop and removes the need to serialize the outer loop, which improves the performance. + +### When to Use This Technique +Look at the _Compiler Report > Throughput Analysis > Loop Analysis_ section in the reports. The report lists the II and details for each loop. 
The technique presented in this tutorial may be applicable if the _Brief Info_ of the loop shows _Serial exe: Data dependency_. The details pane may provide more information: +``` +* Iteration executed serially across _function.block_. Only a single loop iteration will execute inside this region due to data dependency on variable(s): + * sum (_filename:line_) +``` + +## Key Concepts +* Loop carried-dependencies, and their impact on FPGA DPC++ kernel performance +* An optimization technique to break loop-carried data dependencies in critical loops + +## License +This code sample is licensed under MIT license. + +## Building the `loop_carried_dependency` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A `build.ninja` file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + +### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `loop_carried_dependency_report.prj/reports` or in `loop_carried_dependency_s10_pac_report.prj/reports` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Navigate to the _Loops Analysis_ view of the report (under _Throughput Analysis_) and observe that the loop in block `UnOptKernel.B1` is showing _Serial exe: Data dependency_. 
Click on the _source location_ field in the table to see the details for the loop. The maximum interleaving iterations of the loop is 1, as the loop is serialized. + +Now, observe that the loop in block `OptKernel.B1` is not marked as _Serialized_. The maximum Interleaving iterations of the loop is now 12. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./loop_carried_dependency.fpga_emu (Linux) + loop_carried_dependency.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./loop_carried_dependency.fpga (Linux) + ``` + +### Example of Output +``` +Number of elements: 16000 +Run: Unoptimized: +kernel time : 10685.3 ms +Run: Optimized: +kernel time : 2736.47 ms +PASSED +``` +### Discussion of Results + +In the tutorial example, applying the optimization yields a total execution time reduction by almost a factor of 4. The Initiation Interval (II) for the inner loop is 12 because a double floating point add takes 11 cycles on the FPGA. 
+ + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/loop_carried_dependency.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/loop_carried_dependency.sln new file mode 100755 index 0000000000..b319c23b37 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/loop_carried_dependency.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "loop_carried_dependency", "loop_carried_dependency.vcxproj", "{49E7063B-56DA-4ACF-B153-5B56A98645BE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.ActiveCfg = Debug|x64 + {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.Build.0 = Debug|x64 + {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.ActiveCfg = Release|x64 + {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {CC320E26-0D79-434A-8E69-3F09BFB2FCF4} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/loop_carried_dependency.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/loop_carried_dependency.vcxproj new file mode 100755 index 0000000000..0ef4b0a338 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/loop_carried_dependency.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 
+ + + Release + x64 + + + + + + + + + + 15.0 + {49e7063b-56da-4acf-b153-5b56a98645be} + Win32Proj + loop_carried_dependency + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)loop_carried_dependency.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)loop_carried_dependency.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/sample.json new file mode 100755 index 0000000000..de8f0bb430 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "E5C1C1FA-7FDB-4C09-8096-1812080FD6D5", + "name": "Removing Loop Carried Dependencies", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial design demonstrating performance optimization by removing loop carried dependencies", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + 
"targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./loop_carried_dependency.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "loop_carried_dependency.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/CMakeLists.txt new file mode 100755 index 0000000000..e194b6f754 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/CMakeLists.txt @@ -0,0 +1,88 @@ +set(SOURCE_FILE loop_carried_dependency.cpp) +set(TARGET_NAME loop_carried_dependency) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND 
echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/build.ninja new file mode 100755 index 0000000000..fbbdd87caf --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/build.ninja @@ -0,0 +1,30 @@ +source_file = loop_carried_dependency.cpp +target_name = loop_carried_dependency + +emulator_target = ${target_name}.fpga_emu.exe 
+report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/loop_carried_dependency.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/loop_carried_dependency.cpp new file mode 100755 index 0000000000..ab391a42c5 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency/src/loop_carried_dependency.cpp @@ -0,0 +1,174 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +// Forward declare the kernel names +// (This will become unnecessary in a future compiler version.) 
+class UnOptKernel; +class OptKernel; + +event Unoptimized(queue &q, const vector &vec_a, + const vector &vec_b, double &result, size_t N) { + buffer b_a(vec_a); + buffer b_b(vec_b); + buffer b_result(&result, range(1)); + + auto e = q.submit([&](handler &h) { + auto a = b_a.get_access(h); + auto b = b_b.get_access(h); + auto result = b_result.get_access(h); + + h.single_task([=]() { + double sum = 0; + for (size_t i = 0; i < N; i++) { + for (size_t j = 0; j < N; j++) { + sum += a[i * N + j]; + } + sum += b[i]; + } + result[0] = sum; + }); + }); + return e; +} + +event Optimized(queue &q, const vector &vec_a, + const vector &vec_b, double &result, size_t N) { + buffer b_a(vec_a); + buffer b_b(vec_b); + buffer b_result(&result, range(1)); + + auto e = q.submit([&](handler &h) { + auto a = b_a.get_access(h); + auto b = b_b.get_access(h); + auto result = b_result.get_access(h); + + h.single_task([=]() [[intel::kernel_args_restrict]] { + double sum = 0; + + for (size_t i = 0; i < N; i++) { + // Step 1: Definition + double sum_2 = 0; + + // Step 2: Accumulation of array A values for one outer loop iteration + for (size_t j = 0; j < N; j++) { + sum_2 += a[i * N + j]; + } + + // Step 3: Addition of array B value for an outer loop iteration + sum += sum_2; + sum += b[i]; + } + + result[0] = sum; + }); + }); + return e; +} + +void PrintTime(const event &e, queue &q, const char *kind) { + double start_k = e.get_profiling_info(); + double end_k = e.get_profiling_info(); + double kernel_time = (double)(end_k - start_k) * 1e-6; + + cout << "Run: " << kind << ":\n"; + cout << "kernel time : " << kernel_time << " ms\n"; +} + +int main(int argc, char *argv[]) { + size_t n = 16000; + + if (argc > 1) { + string option(argv[1]); + if (option == "-h" || option == "--help") { + cout << "Usage: \n\nFAILED\n"; + return 1; + } else { + n = stoi(option); + } + } + // Cap the value of n. 
+ n = std::max(std::min((size_t)n, (size_t)16000), (size_t)100); + cout << "Number of elements: " << n << '\n'; + + vector vec_a(n * n); + vector vec_b(n); + + double answer = 0; + + // initialize data and compute golden result + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + vec_a[i * n + j] = i + j; + answer += i + j; + } + vec_b[i] = i; + answer += i; + } + + // Initialize queue with device selector and enabling profiling + // Create queue, get platform and device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector selector; + cout << "\nEmulator output does not demonstrate true hardware " + "performance. The design may need to run on actual hardware " + "to observe the performance benefit of the optimization " + "exemplified in this tutorial.\n\n"; +#else + intel::fpga_selector selector; +#endif + + double unopt_sum = -1, opt_sum = -1; + + try { + // Create a profiling queue + queue q(selector, dpc_common::exception_handler, + property::queue::enable_profiling{}); + + // compute result on device + PrintTime(Unoptimized(q, vec_a, vec_b, unopt_sum, n), q, "Unoptimized"); + PrintTime(Optimized(q, vec_a, vec_b, opt_sum, n), q, "Optimized"); + + // q's destructor invokes q's exception handler on any device exceptions. + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + // Check the results + bool failed = false; + if (unopt_sum != answer) { + cout << "Unoptimized: expected: " << answer << ", result: " << unopt_sum + << '\n'; + failed = true; + } + if (opt_sum != answer) { + cout << "Optimized: expected: " << answer << ", result: " << opt_sum + << '\n'; + failed = true; + } + + if (failed) { + cout << "FAILED\n"; + return 1; + } + cout << "PASSED\n"; + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/CMakeLists.txt new file mode 100755 index 0000000000..54283f46f7 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(TriangularLoop) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/README.md new file mode 100755 index 0000000000..ad945c0d27 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/README.md @@ -0,0 +1,295 @@ + +# Triangular Loop Optimization + +This FPGA tutorial demonstrates an advanced technique to improve the performance of nested triangular loops with loop-carried dependencies in single-task kernels. + +***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How and when to apply the triangular loop optimization technique +| Time to complete | 30 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose + +This FPGA tutorial introduces an advanced optimization technique to improve the performance of nested triangular loops with loop-carried dependencies. Such structures are challenging to optimize because of the time-varying loop trip count. + +### What is a triangular loop? + +A triangular loop is a loop nest where the inner-loop range depends on the outer loop variable in such a way that the inner-loop trip-count shrinks or grows. This is best explained with an example: + +```c++ + for (int x = 0; x < n; x++) { + for (int y = x + 1; y < n; y++) { + local_buf[y] = local_buf[y] + SomethingComplicated(local_buf[x]); + } + } +``` + +In this example, the inner-loop executes fewer and fewer iterations as overall execution progresses. Each iteration of the inner-loop performs a read from index `[x]` and a read-modify-write on indices `[y]=x+1` to `[y]=n-1`. Expressed graphically (with _n_=10), these operations look like: + +```c++ + y=0 1 2 3 4 5 6 7 8 9 +========================== +x=0 o x x x x x x x x x +x=1 o x x x x x x x x +x=2 o x x x x x x x +x=3 o x x x x x x +x=4 o x x x x x +x=5 o x x x x +x=6 o x x x +x=7 o x x +x=8 o x +x=9 + +Legend: read="o", read-modify-write="x" +``` + +The picture is triangular in shape, hence the name "triangular loop". + +### Performance challenge + +In the above example, the table shows that in outer-loop iteration `x=0`, the program reads `local_buf[x=0]` and reads, modifies, and writes the values from `local_buf[y=1]` through `local_buf[y=9]`. This pattern of memory accesses results in a loop-carried dependency across the outer loop iterations. For example, the read at `x=2` depends on the value that was written at `x=1,y=2`. 
+ +Generally, a new iteration is launched on every cycle as long as a sufficient number of inner-loop +iterations are executed *between* any two iterations that are dependent on one another. + +However, the challenge in the triangular loop pattern is that the trip-count of the inner-loop +progressively shrinks as `x` increments. In the worst case of `x=7`, the program writes to `local_buf[y=8]` in the first `y` iteration, but has only one intervening `y` iteration at `y=9` before the value must be read again at `x=8,y=8`. This may not allow enough time for the write operation to complete. The compiler compensates for this by increasing the initiation interval (II) of the inner-loop to allow more time to elapse between iterations. Unfortunately, this reduces the throughput of the inner-loop by a factor of II. + +A key observation is that this increased II is only functionally necessary when the inner-loop trip-count becomes small. Furthermore, the II of a loop is static -- it applies for all invocations of that loop. Therefore, if the *outer-loop* trip-count (_n_) is large, then most of the invocations of the inner-loop unnecessarily suffer the aforementioned throughput degradation. The optimization technique demonstrated in this tutorial addresses this issue. + +### Optimization concept + +The triangular loop optimization alters the code to guarantee that the trip count never falls below some minimum (_M_). This is accomplished by executing extra 'dummy' iterations of the inner loop when the *true* trip count falls below _M_. + +The purpose of the dummy iterations is to allow extra time for the loop-carried dependency to resolve. No actual computation (or side effects) take place during these added iterations. Note that the extra iterations are only executed on inner loop invocations that require them. When the inner-loop trip count is large, extra iterations are not needed. + +This technique allows the compiler to achieve II=1. 
+ +Applying the triangular loop optimization to the original example, the post-optimization execution graph for _M_=6 (with _n_=10) appears as follows: + +```c++ + y=0 1 2 3 4 5 6 7 8 9 +========================== +x=0 o x x x x x x x x x +x=1 o x x x x x x x x +x=2 o x x x x x x x +x=3 o x x x x x x +x=4 o x x x x x +x=5 - o x x x x +x=6 - - o x x x +x=7 - - - o x x +x=8 - - - - o x +x=9 + <---M=6---> + +Legend: read="o", read-modify-write="x", dummy iteration="-" +``` + +### Selecting the value of _M_ + +The objective is to find the minimal value of _M_ that enables the compiler to achieve an II of 1. Any value of _M_ larger than this minimum adds unnecessary latency to the computation. + +A good starting point of the value of _M_ is the II of the unoptimized inner loop, which can be found in the "Loop Analysis" report of the unoptimized code. If the compiler can achieve II=1 with this starting value, experiment with reducing _M_ until II increases. If the compiler does not achieve II=1, increase _M_ until it does. This search for the optimal _M_ can be done quickly, as the compiler takes little time to generate the static optimization report. + +### Applying the optimization in code + +Here is the triangular loop optimization of the original code snippet: +```c++ +// Indices to track the execution in the merged loop +int x = 0, y = 1; + +// Total iterations of the merged loop +const int loop_bound = TotalIterations(M, n); + +[[intelfpga::ivdep(M)]] +for (int i = 0; i < loop_bound; i++) { + + // Determine if this is a real or dummy iteration + bool compute = y > x; + if (compute) { + local_buf[y] = local_buf[y] + SomethingComplicated(local_buf[x]); + } + + y++; + if (y == n) { + x++; + y = Min(n - M, x + 1); + } +} +``` +This requires some explanation! + +***Single loop:*** Notice that the original nested loop has been manually coalesced or "merged" into a single loop. 
The explicit `x` and `y` induction variables are employed to achieve the triangular iteration pattern. The actual computation inside the loop is guarded by the condition `y > x`.
+
+***Merged loop trip count:*** The total trip-count of this merged loop is `loop_bound` in the snippet. The value of `loop_bound` is the total number of iterations in the execution graph diagram, which is a function of _n_ and _M_.
+
+To derive the expression for `TotalIterations(M, n)`, consider the iterations as consisting of the following two triangles of "real" and "dummy" iterations.
+
+```c++
+    y=0 1 2 3 4 5 6 7 8 9      y=0 1 2 3 4 5 6 7 8 9
+=========================  =========================
+x=0 o x x x x x x x x x    x=0
+x=1   o x x x x x x x x    x=1
+x=2     o x x x x x x x    x=2
+x=3       o x x x x x x    x=3
+x=4         o x x x x x    x=4
+x=5           o x x x x    x=5         -
+x=6             o x x x    x=6         - -
+x=7               o x x    x=7         - - -
+x=8                 o x    x=8         - - - -
+x=9                        x=9
+        <---M=6--->                    <(M-2)>
+```
+The number of "real" iterations on the left is 10+9+8+7+6+5+4+3+2 = 54. The formula for a
+descending series from `n` is `n*(n+1)/2`. Since there is no iteration at `x=9,y=9`, subtract 1 (i.e., `n*(n+1)/2 - 1`). When _n_=10, this expression yields 54, as expected.
+
+The number of dummy iterations on the right is 4+3+2+1 = 10. The largest number in this series is _M_-2. Using the same formula for a descending series, you get `(M-2)*(M-1)/2`. For _M_=6, this expression yields 4*5/2 = 10, as expected.
+
+Summing the number of real and dummy iterations gives the total iterations of the merged loop.
+
+***Use of ivdep***: Since the loop is restructured to ensure that a minimum of M iterations are executed, the `[[intelfpga::ivdep(M)]]` attribute is used to hint to the compiler that iterations with dependencies are always separated by at least M iterations. 
+ + + +## Key Concepts +* The triangular loop advanced optimization technique, and situations in which it is applicable +* Using `ivdep safelen` to convey the broken loop-carried dependency to the compiler + +## License +This code sample is licensed under MIT license. + + +## Building the `triangular_loop` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. 
+ +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `triangular_loop_report.prj/reports/` or `triangular_loop_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Consult the "Loop Analysis" report to compare the optimized and unoptimized versions of the loop. + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./triangular_loop.fpga_emu (Linux) + triangular_loop.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./triangular_loop.fpga (Linux) + ``` + +### Example of Output + +``` +Platform name: Intel(R) FPGA SDK for OpenCL(TM) +Device name: pac_a10 : Intel PAC Platform (pac_ec00000) + + +Length of input array: 8192 + +Beginning run without triangular loop optimization. 
+ +Verification PASSED + +Execution time: 4.240185 seconds +Throughput without optimization: 30.187364 MB/s + +Beginning run with triangular loop optimization. + +Verification PASSED + +Execution time: 0.141516 seconds +Throughput with optimization: 904.489876 MB/s + +``` + +### Discussion of Results +A test compile of this tutorial design achieved an fMAX of approximately 210 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results with and without the optimization are shown in the following table: + +Configuration | Overall Execution Time (ms) | Throughput (MB/s) +-|-|- +Without optimization | 4972 | 25.7 +With optimization | 161 | 796.6 + +Without optimization, the compiler achieved an II of 30 on the inner-loop. With the optimization, the compiler achieves an II of 1 and the throughput increased by approximately 30x. + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/sample.json new file mode 100755 index 0000000000..7dc1d09170 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "884439A5-0286-447B-9E6D-A7C22B61CED8", + "name": "Triangular Loop Optimization", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating an advanced optimization technique for triangular loops", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./triangular_loop.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + 
"triangular_loop.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/CMakeLists.txt new file mode 100755 index 0000000000..04d6c7add8 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/CMakeLists.txt @@ -0,0 +1,88 @@ +set(SOURCE_FILE triangular_loop.cpp) +set(TARGET_NAME triangular_loop) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link 
${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/build.ninja new file mode 100755 index 0000000000..f13a484a51 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/build.ninja @@ -0,0 +1,30 @@ +source_file = triangular_loop.cpp +target_name = triangular_loop + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony 
${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/triangular_loop.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/triangular_loop.cpp new file mode 100755 index 0000000000..d3a5386bd6 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/src/triangular_loop.cpp @@ -0,0 +1,255 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +// Seed for randomizing data inputs +constexpr int kInitSeed = 42; + +// This tutorial runs twice to show the impact with +// and without the optimization. +constexpr int kNumRuns = 2; + +// number of nanoseconds in a second +constexpr double kNs = 1000000000.0; + +// Number of inputs. Don't set this too large, otherwise +// computation of the reference solution will take a long time on +// the host (the time is proportional to kSize^2) +constexpr int kSize = 8 * 1024; + +// >=1. Minimum number of iterations of the inner loop that must be +// executed in the optimized implementation. Set this approximately +// equal to the ii of inner loop in the unoptimized implementation. +constexpr int kM = 50; + +// do not use with unary operators, e.g., kMin(x++, y++) +constexpr int Min(int X, int Y) { return (((X) < (Y)) ? (X) : (Y)); }; + +// Forward declaration of kernel +class Task; + +// This method represents the operation you perform on the loop-carried variable +// in the triangular loop (i.e. a dot product or something that may take many +// cycles to complete). +int SomethingComplicated(int x) { return (int)sycl::sqrt((float)x); } + +// This kernel function implements two data paths: with and without the +// optimization. 
'optimize' specifies which path to take. +void TriangularLoop(std::unique_ptr& q, buffer& input_buf, + buffer& output_buf, uint32_t n, event& e, + bool optimize) { + // Enqueue kernel + e = q->submit([&](handler& h) { + // Get accessors to the SYCL buffers + auto input = input_buf.get_access(h); + auto output = output_buf.get_access(h); + + h.single_task([=]() [[intel::kernel_args_restrict]] { + // See README for description of the loop_bound calculation. + const int real_iterations = (n * (n + 1) / 2 - 1); + const int extra_dummy_iterations = (kM - 2) * (kM - 1) / 2; + const int loop_bound = real_iterations + extra_dummy_iterations; + + // Local memory for the buffer to be operated on + uint32_t local_buf[kSize]; + + // Read the input_buf from global mem and load it into the local mem + for (uint32_t i = 0; i < kSize; i++) { + local_buf[i] = input[i]; + } + + // Perform the triangular loop computation + + if (!optimize) { // Unoptimized loop. + + for (int x = 0; x < n; x++) { + for (int y = x + 1; y < n; y++) { + local_buf[y] = local_buf[y] + SomethingComplicated(local_buf[x]); + } + } + + } else { // Optimized loop. + + // Indices to track the execution inside the single, merged loop. + int x = 0, y = 1; + + // Specify that the minimum dependence-distance of loop-carried + // variables is kM iterations. We ensure this is true by modifying the y + // index such that a minimum of kM iterations are always executed. + [[intelfpga::ivdep(kM)]] for (int i = 0; i < loop_bound; i++) { + // Determine if this iteration is a dummy iteration or a real + // iteration in which the computation should be performed. + bool compute = y > x; + // Perform the computation if needed. + if (compute) { + local_buf[y] = local_buf[y] + SomethingComplicated(local_buf[x]); + } + // Figure out the next value for the indices. + y++; + + // If we've hit the end, set y such that a minimum of kM + // iterations are exected. 
+ if (y == n) { + x++; + y = Min(n - kM, x + 1); + } + } + } + + // Write the output to global mem + for (uint32_t i = 0; i < kSize; i++) { + output[i] = local_buf[i]; + } + }); + }); + +} + +int main() { + + // Host and kernel profiling + event e; + ulong t1_kernel, t2_kernel; + double time_kernel; +// Create queue, get platform and device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; + std::cout << "\nEmulator output does not demonstrate true hardware " + "performance. The design may need to run on actual hardware " + "to observe the performance benefit of the optimization " + "exemplified in this tutorial.\n\n"; +#else + intel::fpga_selector device_selector; +#endif + + try { + auto prop_list = + property_list{property::queue::enable_profiling()}; + + std::unique_ptr q; + q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list)); + + platform platform = q->get_context().get_platform(); + device device = q->get_device(); + std::cout << "Platform name: " + << platform.get_info().c_str() << "\n"; + std::cout << "Device name: " + << device.get_info().c_str() << "\n\n\n"; + + // Create input and output buffers + auto input_buf = buffer(range<1>(kSize)); + auto output_buf = buffer(range<1>(kSize)); + + srand(kInitSeed); + + // Compute the reference solution + uint32_t gold[kSize]; + + { + // Get host-side accessors to the SYCL buffers. + auto input_host = input_buf.get_access(); + + // Initialize random input + for (int i = 0; i < kSize; ++i) { + input_host[i] = rand() % 256; + } + + for (int i = 0; i < kSize; ++i) { + gold[i] = input_host[i]; + } + } + + // Host accessor now out-of-scope and is destructed. This is required in + // order to unblock the kernel's subsequent accessor to the same buffer. 
+ + for (int x = 0; x < kSize; x++) { + for (int y = x + 1; y < kSize; y++) { + gold[y] += SomethingComplicated(gold[x]); + } + } + + std::cout << "Length of input array: " << kSize << "\n\n"; + + for (int i = 0; i < kNumRuns; i++) { + switch (i) { + case 0: { + std::cout + << "Beginning run without triangular loop optimization.\n\n"; + TriangularLoop(q, input_buf, output_buf, kSize, e, false); + break; + } + case 1: { + std::cout << "Beginning run with triangular loop optimization.\n\n"; + TriangularLoop(q, input_buf, output_buf, kSize, e, true); + break; + } + default: { + TriangularLoop(q, input_buf, output_buf, kSize, e, false); + } + } + + // Wait for kernels to finish + q->wait(); + + t1_kernel = e.get_profiling_info(); + t2_kernel = e.get_profiling_info(); + time_kernel = (t2_kernel - t1_kernel) / kNs; + + // Get accessor to output buffer. Accessing the buffer at this point in + // the code will block on kernel completion. + auto output_host = output_buf.get_access(); + + // Verify output and print pass/fail + bool passed = true; + int num_errors = 0; + for (int b = 0; b < kSize; b++) { + if (num_errors < 10 && output_host[b] != gold[b]) { + passed = false; + std::cout << " Mismatch at element " << b << ". expected " << gold[b] + << ")\n"; + num_errors++; + } + } + + if (passed) { + std::cout << "Verification PASSED\n\n"; + + // Report host execution time and throughput + std::cout.setf(std::ios::fixed); + std::cout << "Execution time: " << time_kernel << " seconds\n"; + int num_iterations = + kSize * (kSize + 1) / 2 - + 1; // One piece of data is processed on each iteration. This + // formula is taken from the loop_bound calculation. + double N_MB = (sizeof(uint32_t) * num_iterations) / + (1024 * 1024); // Amount of data processed, in mB + std::cout << "Throughput " << (i == 0 ? 
"without" : "with") + << " optimization: " << N_MB / time_kernel << " MB/s\n\n"; + } else { + std::cout << "Verification FAILED\n"; + return 1; + } + } + } catch (sycl::exception const& e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/triangular_loop.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/triangular_loop.sln new file mode 100755 index 0000000000..dba49d0132 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/triangular_loop.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "triangular_loop", "triangular_loop.vcxproj", "{B9324A38-DD67-4220-9EC3-42A8ACBDC4F5}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B9324A38-DD67-4220-9EC3-42A8ACBDC4F5}.Debug|x64.ActiveCfg = Debug|x64 + {B9324A38-DD67-4220-9EC3-42A8ACBDC4F5}.Debug|x64.Build.0 = Debug|x64 + {B9324A38-DD67-4220-9EC3-42A8ACBDC4F5}.Release|x64.ActiveCfg = Release|x64 + {B9324A38-DD67-4220-9EC3-42A8ACBDC4F5}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + 
GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {AF287516-09DE-4A70-AF44-3C4F5D850105} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/triangular_loop.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/triangular_loop.vcxproj new file mode 100755 index 0000000000..6d5fc1777b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/triangular_loop/triangular_loop.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {b9324a38-dd67-4220-9ec3-42a8acbdc4f5} + Win32Proj + triangular_loop + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)triagular_loop.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)triagular_loop.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/CMakeLists.txt new file mode 100755 index 
0000000000..325cc3fa42 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(FPGARegister) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/README.md new file mode 100755 index 0000000000..18e2a1f244 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/README.md @@ -0,0 +1,188 @@ +# Explicit Pipeline Register Insertion with `fpga_reg` + +This FPGA tutorial demonstrates how a power user can apply the DPC++ extension `intel::fpga_reg` to tweak the hardware generated by the compiler. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How to use the `intel::fpga_reg` extension
How `intel::fpga_reg` can be used to re-structure the compiler-generated hardware
Situations in which applying `intel::fpga_reg` might be beneficial +| Time to complete | 20 minutes + +_Notice: This code sample is not yet supported in Windows*_ + +## Purpose + +This FPGA tutorial demonstrates an example of using the `intel::fpga_reg` extension to: + +* Help reduce the fanout of specific signals in the DPC++ design +* Improve the overall fMAX of the generated hardware + +Note that this is an advanced tutorial for FPGA power users. + +### Simple Code Example + +The signature of `intel::fpga_reg` is as follows: + +```cpp +template +T intel::fpga_reg(T input) +``` + +To use this function in your code, you must include the following header: + +```cpp +#include +``` + +When you use this function on any value in your code, the compiler will insert at least one register stage between the input and output of `intel::fpga_reg` function. For example: + +```cpp +int func (int input) { + int output = intel::fpga_reg(input) + return output; +} +``` + +This forces the compiler to insert a register between the input and output. You can observe this in the optimization report's System Viewer. + +### Understanding the Tutorial Design + +The basic function performed by the tutorial kernel is a vector dot product with a pre-adder. The loop is unrolled so that the core part of the algorithm is a feed-forward datapath. The coefficient array is implemented as a circular shift register and rotates by one for each iteration of the outer loop. + +The optimization applied in this tutorial impacts the system fMAX or the maximum frequency that the design can run at. Since the compiler implements all kernels in a common clock domain, fMAX is a global system parameter. To see the impact of the `intel::fpga_reg` optimization in this tutorial, you will need to compile the design twice. + +Part 1 compiles the kernel code without setting the `USE_FPGA_REG` macro, whereas Part 2 compiles the kernel while setting this macro. 
This chooses between two code segments that are functionally equivalent, but the latter version makes use of `intel::fpga_reg`. In the `USE_FPGA_REG` version of the code, the compiler is guaranteed to insert at least one register stage between the input and output of each of the calls to the `intel::fpga_reg` function.
+
+#### Part 1: Without `USE_FPGA_REG`
+
+The compiler will generate the following hardware for Part 1. The diagram below has been simplified for illustration.
+
+Part 1
+
+Note the following:
+
+* The compiler automatically infers a tree structure for the series of adders.
+* There is a large fanout (of up to 4 in this simplified example) from `val` to each of the adders.
+
+The fanout grows linearly with the unroll factor in this tutorial. In FPGA designs, signals with large fanout can sometimes degrade system fMAX. This happens because the FPGA placement algorithm cannot place *all* of the fanout logic elements physically close to the fanout source, leading to longer wires. In this situation, it can be helpful to add explicit fanout control in your DPC++ code via `intel::fpga_reg`. This is an advanced optimization for FPGA power-users.
+
+#### Part 2: with `USE_FPGA_REG`
+
+In this part, we added two sets of `intel::fpga_reg` within the unrolled loop. The first is added to pipeline `val` once per iteration. This reduces the fanout of `val` from 4 in the example in Part 1 to just 2. The second `intel::fpga_reg` is inserted between accumulation into the `acc` value. This generates the following structure in hardware.
+
+Part 2
+
+In this version, the adder tree has been transformed into a vine-like structure. This increases latency, but it helps us achieve our goal of reducing the fanout and improving fMAX.
+Since the outer loop in this tutorial is pipelined and has a high trip count, the increased latency of the inner loop has negligible impact on throughput. The tradeoff pays off, as the fMAX improvement yields a higher performing design. 
+ +## Key Concepts + +* How to use the `intel::fpga_reg` extension +* How `intel::fpga_reg` can be used to re-structure the compiler-generated hardware +* Situations in which applying `intel::fpga_reg` might be beneficial + +## License + +This code sample is licensed under MIT license. + +## Building the `fpga_reg` Design + +### Include Files + +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud + +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Install the design in `build` directory from the design directory by running `cmake`: + + ```bash + mkdir build + cd build + ``` + + If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + + ```bash + cmake .. + ``` + + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ```bash + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design using the generated `Makefile`. The following four build targets are provided that match the recommended development flow: + + * Compile and run for emulation (fast compile time, targets emulates an FPGA device) using: + + ```bash + make fpga_emu + ``` + + * Generate HTML optimization reports using: + + ```bash + make report + ``` + + * Compile and run on FPGA hardware (longer compile time, targets an FPGA device) using: + + ```bash + make fpga + ``` + +3. 
(Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA pre-compiled binary can be downloaded here. + + +### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*). +For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports + +Locate the pair of `report.html` files in either: + +* **Report-only compile**: `fpga_reg_report.prj` and `fpga_reg_registered_report.prj` +* **FPGA hardware compile**: `fpga_reg.prj` and `fpga_reg_registered.prj` + +Open the reports in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. Observe the structure of the design in the optimization report's System Viewer and notice the changes within `Cluster 2` of the `SimpleMath.B1` block. You can notice that in the report for Part 1, the viewer shows a much shallower graph as compared to the one in Part 2. This is because the operations are performed much closer to one another in Part 1 as compared to Part 2. By transforming the code in Part 2, with more register stages, the compiler was able to achieve a higher fMAX. + +>**NOTE**: Only the report generated after the FPGA hardware compile will reflect the performance benefit of using the `fpga_reg` extension. The difference is *not* apparent in the reports generated by `make report` because a design's fMAX cannot be predicted. The final achieved fMAX can be found in `fpga_reg.prj/reports/report.html` and `fpga_reg_registered.prj/reports/report.html` (after `make fpga` completes). + +## Running the Sample + +1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + + ```bash + ./fpga_reg.fpga_emu # Linux + ``` + +2.
Run the sample on the FPGA device + + ```bash + ./fpga_reg.fpga # Linux + ./fpga_reg_registered.fpga # Linux + ``` + +### Example of Output + +```txt +Throughput for kernel with input size 1000000 and coefficient array size 64: 2.819272 GFlops +PASSED: Results are correct. +``` + +### Discussion of Results + +You will be able to observe the improvement in the throughput going from Part 1 to Part 2. You will also note that the fMAX of Part 2 is significantly larger than of Part 1. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/fpga_reg.png b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/fpga_reg.png new file mode 100755 index 0000000000..fe33916939 Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/fpga_reg.png differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/no_fpga_reg.png b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/no_fpga_reg.png new file mode 100755 index 0000000000..5383063625 Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/no_fpga_reg.png differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/sample.json new file mode 100755 index 0000000000..57573588f4 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/sample.json @@ -0,0 +1,34 @@ +{ + "guid": "D661A5C2-5FE0-40F2-BFE7-70E3BA60F088", + "name": "Explicit Pipeline Register Insertion with fpga_reg", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA advanced tutorial demonstrating how to apply the DPC++ extension intel::fpga_reg", + "toolchain": ["dpcpp"], + "os": ["linux"], + "targetDevice": ["FPGA"], + "builder": ["cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./fpga_reg.fpga_emu" + ] + }, + 
{ + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/src/CMakeLists.txt new file mode 100755 index 0000000000..2880b9dcf9 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/src/CMakeLists.txt @@ -0,0 +1,111 @@ +set(SOURCE_FILE fpga_reg.cpp) +set(TARGET_NAME fpga_reg) +set(TARGET_NAME_REG fpga_reg_registered) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) +set(FPGA_TARGET_REG ${TARGET_NAME_REG}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) + +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_executable(${FPGA_TARGET_REG} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET} ${FPGA_TARGET_REG}) + + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) + + set_target_properties(${FPGA_TARGET_REG} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS} -DUSE_FPGA_REG") + set_target_properties(${FPGA_TARGET_REG} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# report +if(WIN32) + set(REPORT ${TARGET_NAME}_report.a) + set(REPORT_REG 
${TARGET_NAME_REG}_report.a) + + add_custom_target(report DEPENDS ${REPORT} ${REPORT_REG}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${SOURCE_FILE} COPYONLY) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${CMAKE_BINARY_DIR}/${TARGET_NAME_REG}/${SOURCE_FILE} COPYONLY) + + add_custom_command(OUTPUT ${REPORT} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT} + DEPENDS ${SOURCE_FILE}) + + add_custom_command(OUTPUT ${REPORT_REG} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -DUSE_FPGA_REG -fsycl-link ${CMAKE_BINARY_DIR}/${TARGET_NAME_REG}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT_REG} + DEPENDS ${SOURCE_FILE}) + +else() + set(REPORT ${TARGET_NAME}_report.a) + set(REPORT_REG ${TARGET_NAME_REG}_report.a) + + add_custom_target(report DEPENDS ${REPORT} ${REPORT_REG}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${REPORT} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT} + DEPENDS ${SOURCE_FILE}) + + add_custom_command(OUTPUT ${REPORT_REG} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -DUSE_FPGA_REG -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT_REG} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/src/fpga_reg.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/src/fpga_reg.cpp new file mode 
100755 index 0000000000..c15255631b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/fpga_reg/src/fpga_reg.cpp @@ -0,0 +1,216 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; +using namespace std; + +// Artificial coefficient and offset data for our math function +constexpr size_t kSize = 64; +constexpr std::array kCoeff = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}; +constexpr std::array kOffset = { + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; + +// The function our kernel will compute +// The "golden result" will be computed on the host to check the kernel result. +vector GoldenResult(vector vec) { + + // The coefficients will be modified with each iteration of the outer loop. + std::array coeff = kCoeff; + + for (int &val : vec) { + // Do some arithmetic + int acc = 0; + for (size_t i = 0; i < kSize; i++) { + acc += coeff[i] * (val + kOffset[i]); + } + + // Update coeff by rotating the values of the array + int tmp = coeff[0]; + for (size_t i = 0; i < kSize - 1; i++) { + coeff[i] = coeff[i + 1]; + } + coeff[kSize - 1] = tmp; + + // Result + val = acc; + } + + return vec; +} + +// Forward declaration of the kernel name +// (This will become unnecessary in a future compiler version.) 
+class SimpleMath; + +void RunKernel(const device_selector &selector, + const std::vector &vec_a, + std::vector &vec_r) { + + size_t input_size = vec_a.size(); + + try { + queue q(selector, dpc_common::exception_handler, + property::queue::enable_profiling{}); + + buffer device_a(vec_a); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + buffer device_r(vec_r.data(), input_size); + + event e = q.submit([&](handler &h) { + auto a = device_a.get_access(h); + auto r = device_r.get_access(h); + + // FPGA-optimized kernel + // Using kernel_args_restrict tells the compiler that the input + // and output buffers won't alias. + h.single_task([=]() [[intel::kernel_args_restrict]] { + + // Force the compiler to implement the coefficient array in FPGA + // pipeline registers rather than in on-chip memory. + [[intelfpga::register]] std::array coeff = kCoeff; + + // The compiler will pipeline the outer loop. + for (size_t i = 0; i < input_size; ++i) { + int acc = 0; + int val = a[i]; + + // Fully unroll the accumulator loop. + // All of the unrolled operations can be freely scheduled by the + // DPC++ compiler's FPGA backend as part of a common data pipeline. + #pragma unroll + for (size_t j = 0; j < kSize; j++) { +#ifdef USE_FPGA_REG + // Use fpga_reg to insert a register between the copy of val used + // in each unrolled iteration. + val = intel::fpga_reg(val); + // Since val is held constant across the kSize unrolled iterations, + // the FPGA hardware structure of val's distribution changes from a + // kSize-way fanout (without fpga_reg) to a chain of registers + // with intermediate tap offs. Refer to the diagram in the README. + + // Use fpga_reg to insert a register between each step in the acc + // adder chain. + acc = intel::fpga_reg(acc) + (coeff[j] * (val + kOffset[j])); + // This transforms a compiler-inferred adder tree into an adder + // chain, altering the structure of the pipeline.
Refer to the + // diagram in the README. +#else + // Without fpga_reg, the compiler schedules the operations here + // according to its default optimization heuristics. + acc += (coeff[j] * (val + kOffset[j])); +#endif + } + + // Rotate the values of the coefficient array. + // The loop is fully unrolled. This is a canonical code structure; + // the DPC++ compiler's FPGA backend infers a shift register here. + int tmp = coeff[0]; + #pragma unroll + for (size_t j = 0; j < kSize - 1; j++) { + coeff[j] = coeff[j + 1]; + } + coeff[kSize - 1] = tmp; + + // Result + r[i] = acc; + } + }); + }); + + // Measure kernel execution time + double start = e.get_profiling_info(); + double end = e.get_profiling_info(); + // Convert from nanoseconds to milliseconds. + double kernel_time = (end - start) * 1e-6; + + // Kernel consists of two nested loops with 3 operations in the innermost + // loop: 2 additions and 1 multiplication operation. + size_t num_ops_per_kernel = input_size * kSize * 3; + cout << "Throughput for kernel with input size " << input_size + << " and coefficient array size " << kSize << ": "; + cout << std::fixed << std::setprecision(6) + << ((double)num_ops_per_kernel / kernel_time) / 1.0e6 << " GFlops\n"; + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } +} + +int main(int argc, char *argv[]) { + size_t input_size = 1e6; + + // Optional command line override of default input size + if (argc > 1) { + string option(argv[1]); + if (option == "-h" || option == "--help") { + cout << "Usage: \n \n\nFAILED\n"; + return 1; + } else { + input_size = stoi(option); + } + } + + // Initialize input vector + constexpr int max_val = 1<<10; // Conservative max to avoid integer overflow + vector vec_a(input_size); + for (size_t i = 0; i < input_size; i++) { + vec_a[i] = rand() % max_val; + } + // Kernel result vector + vector vec_r(input_size); + + // Run the kernel on either the FPGA emulator, or FPGA +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector selector; +#else + intel::fpga_selector selector; +#endif + RunKernel(selector, vec_a, vec_r); + + // Test the results. 
+ vector golden_ref = GoldenResult(vec_a); + bool correct = true; + for (size_t i = 0; i < input_size; i++) { + if (vec_r[i] != golden_ref[i]) { + cout << "Found mismatch at " << i << ", " + << vec_r[i] << " != " << golden_ref[i] << "\n"; + correct = false; + } + } + + if (correct) { + cout << "PASSED: Results are correct.\n"; + } else { + cout << "FAILED: Results are incorrect.\n"; + return 1; + } + + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/CMakeLists.txt new file mode 100755 index 0000000000..125d32c072 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(MemoryAttributesOverview) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions 
of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/README.md new file mode 100755 index 0000000000..799c4bcf8b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/README.md @@ -0,0 +1,182 @@ + +# Avoiding Aliasing of Kernel Arguments +This tutorial explains the `kernel_args_restrict` attribute and its effect on the performance of FPGA kernels. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | The problem of *pointer aliasing* and its impact on compiler optimizations
The behavior of the `kernel_args_restrict` attribute and when to use it on your kernel
The effect this attribute can have on your kernel's performance on FPGA +| Time to complete | 20 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +Due to pointer aliasing, the compiler must be conservative about optimizations that reorder, parallelize or overlap operations that could alias. This tutorial demonstrates the use of the DPC++ `[[intel::kernel_args_restrict]]` kernel attribute, which should be applied any time you can guarantee that kernel arguments do not alias. This attribute enables more aggressive compiler optimizations and often improves kernel performance on FPGA. + + +### What Is Pointer Aliasing? +Pointer aliasing occurs when the same memory location can be accessed using different *names* (i.e. variables). For example, consider the code below. Here, the variable `pi` can be changed one of three ways: `pi=3.14159`, `*a=3.14159` or `*b=3.14159`. In general, the compiler has to be conservative about which accesses may alias to each other and avoid making optimizations that reorder and/or parallelize operations. + +```c++ +float pi = 3.14; +float *a = π +float *b = a; +``` +### Pointer Aliasing of Arguments +Consider the function illustrated below. Though the intention of the code is clear to the reader, the compiler cannot guarantee that `in` does not alias with `out`. Imagine a degenerate case where the function was called: like this `myCopy(ptr, ptr+1, 10)`. This would cause `in[i]` and `out[i+1]` to alias to the same address, for all `i` from 0 to 9. +```c++ +void myCopy(int *in, int *out, size_t int size) { + for(size_t int i = 0; i < size; i++) { + out[i] = in[i]; + } +} +``` +This possibility of aliasing forces the compiler to be conservative. Without more information from the developer, it cannot make any optimizations which overlap, vectorize or reorder the assignment operations. 
Doing so would result in functionally incorrect behavior if the compiled function is called with aliasing pointers. + +If this code is compiled to FPGA, the performance penalty of this conservatism is severe. The loop in `myCopy` cannot be pipelined, because the next iteration of the loop cannot begin until the current iteration has completed. + +### A Promise to the Compiler +The developer often knows that pointer arguments will never alias in practice, as with the `myCopy` function. In your DPC++ program, you can use the `[[intel::kernel_args_restrict]]` attribute to inform the compiler that none of a kernel's arguments will alias with any other, thereby enabling more aggressive optimizations. If the non-aliasing assumption is violated at runtime, the result will be undefined behavior. + +C and OpenCL programmers may recognize this concept as the `restrict` keyword. + +### Tutorial Code Description +In this tutorial, we will show how to use the `kernel_args_restrict` attribute for your kernel and the effect it has on performance. We show two kernels that perform the same function; one with the `[[intel::kernel_args_restrict]]` applied to it and the other without. The function of the kernel is simple: copy the contents of one buffer to another. We will analyze the effect of the `[[intel::kernel_args_restrict]]` attribute on the performance of the kernel by analyzing the loop II in the reports and the latency of the kernel on actual hardware. + +## Key Concepts +* The problem of *pointer aliasing* and its impact on compiler optimizations +* The behavior of the `kernel_args_restrict` attribute and when to use it on your kernel +* The effect this attribute can have on your kernel's performance on FPGA + +## License +This code sample is licensed under MIT license. + +## Building the `kernel_args_restrict` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+ +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `kernel_args_restrict_report.prj/reports/` or `kernel_args_restrict_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Navigate to the *Loop Analysis* report (*Throughput Analysis* > *Loop Analysis*). In the *Loop List pane* you should see two kernels: one is the kernel without the attribute applied (*KernelArgsNoRestrict*) and the other with the attribute applied (*KernelArgsRestrict*). Each kernel has a single for-loop, which appears in the *Loop List* pane. Click on the loop under each kernel to see how it was optimized by the compiler. + +Compare the loop initiation interval (II) between the two kernels. Notice that the loop in the *KernelArgsNoRestrict* kernel has a large estimated II, while the loop in the *KernelArgsRestrict* kernel has an estimated II of ~1. These IIs are estimates because the latency of global memory accesses varies with runtime conditions. + +For the *KernelArgsNoRestrict* kernel, the compiler assumed that the kernel arguments can alias each other.
Since`out[i]` and `in[i+1]` could be the same memory location, the compiler cannot overlap the iteration of the loop performing `out[i] = in[i]` with the next iteration of the loop performing `out[i+1] = in[i+1]` (and likewise for iterations `in[i+2]`, `in[i+3]`, ...). This results in an II equal to the latency of the global memory read of `in[i]` plus the latency of the global memory write to `out[i]`. + +We can confirm this by looking at the details of the loop. Click on the *KernelArgsNoRestrict* kernel in the *Loop List* pane and then click on the loop in the *Loop Analysis* pane. Now consider the *Details* pane below. You should see something like: + +- *Compiler failed to schedule this loop with smaller II due to memory dependency* + - *From: Load Operation (kernel_args_restrict.cpp: 74 > accessor.hpp: 945)* + - *To: Store Operation (kernel_args_restrict.cpp: 74)* +- *Most critical loop feedback path during scheduling:* + - *144.00 clock cycles Load Operation (kernel_args_restrict.cpp: 74 > accessor.hpp: 945)* + - *42.00 clock cycles Store Operation (kernel_args_restrict.cpp: 74)* + +The first bullet (and its sub-bullets) tell you that there is memory dependency between the load and store operations in the loop. This is the conservative pointer aliasing memory dependency described earlier. The second bullet shows you the estimated latencies for the load and store operations (note that these are board-dependent). The sum of these two latencies (plus 1) is the II of the loop. + +Next, look at the loop details of the *KernelArgsRestrict* kernel. You will notice that the *Details* pane doesn't show a memory dependency. The usage of the `[[intel::kernel_args_restrict]]` attribute allowed the compiler to schedule a new iteration of the for-loop every cycle since it knows that accesses to `in` and `out` will never alias. + + +## Running the Sample + + 1. 
Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./kernel_args_restrict.fpga_emu (Linux) + kernel_args_restrict.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./kernel_args_restrict.fpga (Linux) + ``` + +### Example of Output +``` +Kernel throughput without attribute: 8.06761 MB/s +Kernel throughput with attribute: 766.873 MB/s +PASSED +``` + +### Discussion of Results + +The throughput observed when running the kernels with and without the `kernel_args_restrict` attribute should reflect the difference in loop II seen in the reports. The ratios will not exactly match because the loop IIs are estimates. An example ratio (compiled and run on the Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA) is shown. + +Attribute used? | II | Kernel Throughput (MB/s) +------------- | ------------- | -------- +No | ~187 | 8 +Yes | ~1 | 767 + +Note that this performance difference will be apparent only when running on FPGA hardware. The emulator, while useful for verifying functionality, will generally not reflect differences in performance. 
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/kernel_args_restrict.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/kernel_args_restrict.sln new file mode 100755 index 0000000000..7fd1d9a291 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/kernel_args_restrict.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "kernel_args_restrict", "kernel_args_restrict.vcxproj", "{D6A634E7-9F2B-46C2-A21C-2402F631A55A}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Debug|x64.ActiveCfg = Debug|x64 + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Debug|x64.Build.0 = Debug|x64 + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Release|x64.ActiveCfg = Release|x64 + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {4AC13DD2-5B0F-4051-93BF-85AEAF6E50C9} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/kernel_args_restrict.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/kernel_args_restrict.vcxproj new file mode 100755 index 0000000000..7b0b629cf5 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/kernel_args_restrict.vcxproj @@ -0,0 +1,155 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {d6a634e7-9f2b-46c2-a21c-2402f631a55a} + Win32Proj + kernel_args_restricts + 
$(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/sample.json new file mode 100755 index 0000000000..45a85e30a9 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "86066897-498B-41C5-BFA3-A03D3CAE2503", + "name": "Avoiding Aliasing of Kernel Arguments", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "Explain the `kernel_args_restrict` attribute and its effect on the performance of FPGA kernels.", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./kernel_args_restrict.fpga_emu" + 
] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "kernel_args_restrict.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/CMakeLists.txt new file mode 100755 index 0000000000..0a4f13cefc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/CMakeLists.txt @@ -0,0 +1,94 @@ +set(SOURCE_FILE kernel_args_restrict.cpp) +set(TARGET_NAME kernel_args_restrict) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) + +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND 
echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# report +if(WIN32) + set(REPORT ${TARGET_NAME}_report.a) + + add_custom_target(report DEPENDS ${REPORT}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${SOURCE_FILE} COPYONLY) + + add_custom_command(OUTPUT ${REPORT} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT} + DEPENDS ${SOURCE_FILE}) + +else() + set(REPORT ${TARGET_NAME}_report.a) + + add_custom_target(report DEPENDS ${REPORT}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${REPORT} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/build.ninja new file mode 100755 index 0000000000..5213ba0f55 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/build.ninja @@ -0,0 +1,30 @@ +source_file = kernel_args_restrict.cpp +target_name = kernel_args_restrict + +emulator_target = ${target_name}.fpga_emu.exe 
+report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/kernel_args_restrict.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/kernel_args_restrict.cpp new file mode 100755 index 0000000000..550f122ece --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/kernel_args_restrict/src/kernel_args_restrict.cpp @@ -0,0 +1,134 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +// problem input size +constexpr size_t kInSize = 1000000; +constexpr double kInputMB = (kInSize * sizeof(int)) / (1024 * 1024); +constexpr int kRandMax = 7777; + +// Forward declare the kernel names +// (This will become unnecessary in a future compiler version.) 
+class KernelArgsRestrict; +class KernelArgsNoRestrict; + +// Return the execution time of the event, in seconds +double GetExecutionTime(const event &e) { + double start_k = e.get_profiling_info(); + double end_k = e.get_profiling_info(); + double kernel_time = (end_k - start_k) * 1e-9; // ns to s + return kernel_time; +} + +void RunKernels(size_t size, std::vector &in, std::vector &nr_out, + std::vector &r_out) { + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + // create the SYCL device queue + queue q(device_selector, dpc_common::exception_handler, + property::queue::enable_profiling{}); + + buffer in_buf(in); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + buffer nr_out_buf(nr_out.data(), size); + buffer r_out_buf(r_out.data(), size); + + // submit the task that DOES NOT apply the kernel_args_restrict attribute + auto e_nr = q.submit([&](handler &h) { + auto in_acc = in_buf.get_access(h); + auto out_acc = nr_out_buf.get_access(h); + + h.single_task([=]() { + for (size_t i = 0; i < size; i++) { + out_acc[i] = in_acc[i]; + } + }); + }); + + // submit the task that DOES apply the kernel_args_restrict attribute + auto e_r = q.submit([&](handler &h) { + auto in_acc = in_buf.get_access(h); + auto out_acc = r_out_buf.get_access(h); + + h.single_task([=]() [[intel::kernel_args_restrict]] { + for (size_t i = 0; i < size; i++) { + out_acc[i] = in_acc[i]; + } + }); + }); + + // measure the execution time of each kernel + double nr_time = GetExecutionTime(e_nr); + double r_time = GetExecutionTime(e_r); + + std::cout << "Kernel throughput without attribute: " << (kInputMB / nr_time) + << " MB/s\n"; + std::cout << "Kernel throughput with attribute: " << (kInputMB / r_time) + << " MB/s\n"; + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host 
exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } +} + +int main() { + // seed the random number generator + srand(0); + + // input/output data + std::vector in(kInSize); + std::vector nr_out(kInSize), r_out(kInSize); + + // generate some random input data + for (size_t i = 0; i < kInSize; i++) { + in[i] = rand() % kRandMax; + } + + // Run the kernels + RunKernels(kInSize, in, nr_out, r_out); + + // validate the results + for (size_t i = 0; i < kInSize; i++) { + if (in[i] != nr_out[i]) { + std::cout << "FAILED: mismatch at entry " << i + << " of 'KernelArgsNoRestrict' kernel output\n"; + return 1; + } + } + for (size_t i = 0; i < kInSize; i++) { + if (in[i] != r_out[i]) { + std::cout << "FAILED: mismatch at entry " << i + << " of 'KernelArgsRestrict' kernel output\n"; + return 1; + } + } + + std::cout << "PASSED\n"; + + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/CMakeLists.txt new file mode 100755 index 0000000000..8ab3aa3653 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(LoopCoalesce) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/License.txt new file mode 100755 
index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/README.md new file mode 100755 index 0000000000..4b2530a96a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/README.md @@ -0,0 +1,167 @@ + +# Coalescing Nested Loops +This FPGA tutorial demonstrates applying the `loop_coalesce` attribute to a nested loop in a task kernel to reduce the area overhead. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. 
The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | What the `loop_coalesce` attribute does
How the `loop_coalesce` attribute affects resource usage and loop throughput
How to apply the `loop_coalesce` attribute to loops in your program
Which loops make good candidates for coalescing +| Time to complete | 15 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +The `loop_coalesce` attribute enables you to direct the compiler to combine nested loops into a single loop. The attribute `[[intelfpga::loop_coalesce(N)]]` takes an integer argument `N` that specifies how many nested loop levels you want the compiler to attempt to coalesce. + +**NOTE**: If you specify `[[intelfpga::loop_coalesce(1)]]` on nested loops, the compiler does not attempt to coalesce any of the nested loops. +### Example: Coalescing Two Loops + +``` +[[intelfpga::loop_coalesce(2)]] +for (int i = 0; i < N; i++) + for (int j = 0; j < M; j++) + sum[i][j] += i+j; +``` +The compiler coalesces the two loops together so that they execute as if they were a single loop written as follows: + +``` +int i = 0; +int j = 0; +while(i < N){ + sum[i][j] += i+j; + j++; + if (j == M){ + j = 0; + i++; + } +} +``` + +### Identifying Which Loops to Coalesce +Generally, coalescing loops can help reduce area usage by reducing the overhead needed for loop control. However, in some circumstances, coalescing loops can reduce kernel throughput. Scenarios where the `loop_coalesce` attribute can be applied to save area without a loss of throughput are those where: + + 1. The loops being coalesced have the same initiation interval (II). + 2. The exit condition computation for the resulting coalesced loop is not complicated. + +If the innermost coalesced loop has a very small trip count, `loop_coalesce` might actually improve throughput. + + +## Key Concepts +* Description of the `loop_coalesce` attribute +* How the `loop_coalesce` attribute affects resource usage and loop throughput +* How to apply the `loop_coalesce` attribute to loops in your program +* Determining which loops make good candidates for coalescing + +## License +This code sample is licensed under MIT license. 
+ + +## Building the `loop_coalesce` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `loop_coalesce_report.prj/reports/` or `loop_coalesce_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +On the main report page, scroll down to the section titled `Compile Estimated Kernel Resource Utilization Summary`. Each kernel name ends in the loop_coalesce attribute argument used for that kernel, e.g., KernelCompute<2> uses a loop_coalesce argument of 2. You can verify that the number of registers, MLABs and DSPs used for each kernel decreases after nested loops are coalesced. + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./loop_coalesce.fpga_emu (Linux) + loop_coalesce.fpga_emu.exe (Windows) + ``` +2. 
Run the sample on the FPGA device: + ``` + ./loop_coalesce.fpga (Linux) + ``` + +### Example of Output + +``` +Loop Coalesce: 1 -- kernel time : 156 microseconds +Throughput for kernel with coalesce_factor 1: 6550KB/S +Loop Coalesce: 2 -- kernel time : 113 microseconds +Throughput for kernel with coalesce_factor 2: 9064KB/S +PASSED: The results are correct + +``` + +### Discussion of Results +The execution time and throughput for each kernel is displayed. Applying the `loop_coalesce` attribute in this example reduced the kernel execution time by a factor of ~1.5. Note that you will only see this result when executing on FPGA hardware. The emulator will generally not reflect performance differences. + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/loop_coalesce.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/loop_coalesce.sln new file mode 100755 index 0000000000..ba59611875 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/loop_coalesce.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "loop_coalesce", "loop_coalesce.vcxproj", "{CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.ActiveCfg = Debug|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.Build.0 = Debug|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.ActiveCfg = Release|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + 
GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {92BEFAAB-0365-4E5A-9C4A-E50AB49B2A6B} + EndGlobalSection +EndGlobal \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/loop_coalesce.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/loop_coalesce.vcxproj new file mode 100755 index 0000000000..ee6a1746e4 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/loop_coalesce.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {cf6a576b-665d-4f24-bb62-0dae7a7b3c64} + Win32Proj + loop_coalesce + 10.0.17763.0 + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)loop_coalesce.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)loop_coalesce.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/sample.json new file mode 100755 index 0000000000..c43debe7c9 --- /dev/null 
+++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "370A5B2B-EBB3-4E7F-89F3-73D333522215", + "name": "Coalescing Nested Loops", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating the loop_coalesce attribute", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./loop_coalesce.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "loop_coalesce.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/CMakeLists.txt new file mode 100755 index 0000000000..bf71de4094 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/CMakeLists.txt @@ -0,0 +1,88 @@ +set(SOURCE_FILE loop_coalesce.cpp) +set(TARGET_NAME loop_coalesce) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND 
echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/build.ninja new file mode 100755 index 0000000000..edc74950ec --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/build.ninja @@ -0,0 +1,30 @@ +source_file = loop_coalesce.cpp +target_name = loop_coalesce + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = 
${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/loop_coalesce.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/loop_coalesce.cpp new file mode 100755 index 0000000000..a779bec4b1 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_coalesce/src/loop_coalesce.cpp @@ -0,0 +1,176 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +// Matrix dimensions +constexpr size_t kNumRows = 4; +constexpr size_t kNumCols = 4; +constexpr size_t kNumElements = kNumRows * kNumCols; + +// Total floating point ops performed by the kernel +constexpr size_t kTotalOps = (4 + (3*kNumCols)) * kNumElements; + + +// Forward declare the kernel name +// (This will become unnecessary in a future compiler version.) +template class KernelCompute; + +// The kernel implements a matrix multiplication. +// This is not meant to be a high performance implementation on FPGA! 
+// It's just a simple kernel with nested loops to illustrate loop coalescing. +template +void MatrixMultiply(const device_selector &selector, + const std::vector &matrix_a, + const std::vector &matrix_b, + std::vector &res) { + double kernel_time = 0.0; + try { + auto prop_list = property_list{property::queue::enable_profiling()}; + + queue q(selector, dpc_common::exception_handler, prop_list); + + buffer buffer_in_a(matrix_a); + buffer buffer_in_b(matrix_b); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + buffer buffer_out(res.data(), kNumElements); + + event e = q.submit([&](handler &h) { + auto accessor_matrix_a = buffer_in_a.get_access(h); + auto accessor_matrix_b = buffer_in_b.get_access(h); + auto accessor_res = buffer_out.get_access(h); + + // The kernel_args_restrict promises the compiler that this kernel's + // accessor arguments won't alias (i.e. non-overlapping memory regions). + h.single_task>( + [=]() [[intel::kernel_args_restrict]] { + size_t idx = 0; + float a[kNumRows][kNumCols]; + float b[kNumRows][kNumCols]; + float tmp[kNumRows][kNumCols]; + + // The loop_coalesce instructs the compiler to attempt to "merge" + // coalesce_factor loop levels of this nested loop together. + // For example, a coalesce_factor of 2 turns this into a single loop. + [[intelfpga::loop_coalesce(coalesce_factor)]] + for (size_t i = 0; i < kNumRows; ++i) { + for (size_t j = 0; j < kNumCols; ++j) { + a[i][j] = accessor_matrix_a[idx]; + b[i][j] = accessor_matrix_b[idx]; + tmp[i][j] = 0.0; + idx++; + } + } + + // Applying loop_coalesce to the outermost loop of a deeply nested + // loop results coalescing from the outside in. + // For example, a coalesce_factor of 2 coalesces the "i" and "j" loops, + // making a doubly nested loop. 
+ [[intelfpga::loop_coalesce(coalesce_factor)]] + for (size_t i = 0; i < kNumRows; ++i) { + for (size_t j = 0; j < kNumCols; ++j) { + float sum = 0.0f; + for (size_t k = 0; k < kNumCols; ++k) { + sum += a[i][k] * b[k][j]; + } + tmp[i][j] = sum; + } + } + + idx = 0; + [[intelfpga::loop_coalesce(coalesce_factor)]] + for (size_t i = 0; i < kNumRows; ++i) { + for (size_t j = 0; j < kNumCols; ++j) { + accessor_res[idx] = tmp[i][j]; + idx++; + } + } + + }); + }); + + // Kernel profiling data + double start = e.get_profiling_info(); + double end = e.get_profiling_info(); + // convert nanoseconds to microseconds + kernel_time = (double)(end - start) * 1e-3; + + } catch (exception const &exc) { + std::cout << "Caught synchronous SYCL exception:\n" << exc.what() << '\n'; + if (exc.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + std::cout << "Loop Coalesce: " << coalesce_factor + << " -- kernel time : " << kernel_time << " microseconds\n"; + std::cout << "Throughput for kernel with coalesce_factor " << coalesce_factor + << ": "; + std::cout << std::fixed << std::setprecision(0) + << (((double)kTotalOps * sizeof(float) * 1e-3f) / + (kernel_time * 1e-6f)) << "KB/s\n"; +} + +int main() { + std::vector matrix_a(kNumElements); + std::vector matrix_b(kNumElements); + std::vector matrix_output_no_col(kNumElements); + std::vector matrix_output(kNumElements); + + // Specify the matrices to be multiplied + for (size_t i = 0; i < kNumRows; i++) { + size_t pos = i * kNumCols; + // Initialize A as identity matrix + matrix_a[i + pos] = 1.0; + for (size_t j = 0; j < kNumCols; j++) { + matrix_b[pos + j] = i * j + 1; + } + } + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector selector; +#else + intel::fpga_selector selector; +#endif + + // Two 
versions of the simple matrix multiply kernel will be enqueued: + // - with coalesce_factor=1 (i.e. no loop coalescing) + // - with coalesce_factor=2 (coalesce two nested levels) + MatrixMultiply<1>(selector, matrix_a, matrix_b, matrix_output_no_col); + MatrixMultiply<2>(selector, matrix_a, matrix_b, matrix_output); + + // Correctness check + bool passed = true; + for (size_t i = 0; i < kNumRows; i++) { + size_t pos = i * kNumCols; + for (size_t j = 0; j < kNumCols; j++) { + float val_noCol = matrix_output_no_col[pos + j]; + float val = matrix_output[pos + j]; + if (val_noCol != i * j + 1 || val != i * j + 1) { + std::cout << "FAILED: The results are incorrect\n"; + passed = false; + } + } + } + + if (passed) { + std::cout << "PASSED: The results are correct\n"; + return 0; + } else { + std::cout << "FAILED\n"; + return -1; + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/CMakeLists.txt new file mode 100755 index 0000000000..3805253a8e --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(LoopIvdep) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, 
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/README.md new file mode 100755 index 0000000000..2cd79d752d --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/README.md @@ -0,0 +1,251 @@ + + +# Loop `ivdep` Attribute +This FPGA tutorial demonstrates how to apply the `ivdep` attribute to a loop to aid the compiler's loop dependence analysis. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Basics of loop-carried dependencies
The notion of a loop-carried dependence distance
What constitutes a *safe* dependence distance
How to aid the compiler's dependence analysis to maximize performance +| Time to complete | 30 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +In order to understand and apply `ivdep` to loops in your design, it is necessary to understand the concepts of loop-carried memory dependencies. Unlike many other attributes that can be used to improve a design's performance, `ivdep` has functional implications. Using it incorrectly will result in undefined behavior for your design! + +### Loop-Carried Memory Dependencies +A *loop-carried memory dependency* refers to a situation where memory access in a given loop iteration cannot proceed until a memory access from a previous loop iteration is completed. Loop-carried dependencies can be categorized into the following cases: +* **True-dependence (Read-After-Write)** - A memory location read in an iteration that must occur after a previous iteration writes to the same memory location. +* **Anti-dependence (Write-After-Read)** - A memory location read must occur before a future iteration writes to the same memory location. +* **Output-dependence (Write-After-Write)** - A memory location write must occur before a future iteration writes to the same memory location. + +The Intel® oneAPI DPC++ Compiler (Beta) employs static analysis to scan the program's code to establish the dependence relationships between all memory accesses in a loop. However, depending on the complexity of the addressing expressions and the loop's stride or upper bound, the compiler may not be able to statically determine precise dependence information. + +In such scenarios, the compiler must conservatively assume some statements to be dependent in order to guarantee functional correctness of the generated hardware. Precise dependence information is crucially important to generate an efficient pipelined datapath. 
Such information reduces the number of assumed dependencies, allowing the hardware schedule to extract as much pipeline parallelism from loops as possible. + +#### Example 1: Basic true-dependence +Each iteration of the loop reads a value from a memory location that is written to in the previous iteration. The pipelined datapath generated by the compiler cannot issue a new iteration until the previous iteration is complete. + +```c++ +for(i = 1; i < n; i++){ + S: a[i] = a[i-1]; +} +``` + +#### Example 2: Complex or statically-unknown indexing expression +The compiler cannot statically infer the true access pattern for the loads from array `a`. To guarantee functional correctness, the compiler must conservatively assume the statements in the loop to be dependent across all iterations. The resulting generated datapath issues new iterations, similar to example 1, executing one iteration at a time. +```c++ +for(i = 0; i < n; i++){ + S: a[i] = a[b[i]]; +} +``` + +#### Example 3: Loop-independent dependence +Some memory dependencies in program code do not span multiple iterations of a loop. In the following example code, dependencies from statement `S2` on `S1` and from statement `S3` on `S1` are referred to as loop-independent memory dependencies. Such dependencies do not prevent the compiler from generating an efficient pipelined loop datapath and are not considered in this tutorial. +```c++ +for(i = 0; i < n; i++){ + S1: a[i] = foo(); + ... + S2: b[i] = a[i]; +} +for(j = 0; j < m; j++){ + S3: a[i] = bar(); +} +``` + +### Loop-carried dependence distance +Imagine loop-carried dependencies in terms of the distance between the dependence source and sink statements, measured in the number of iterations of the loop containing the statements. In example 1, the dependence source (store into array `a`) and dependence sink (load from the same index in array `a`) are one iteration apart.
That is, for the specified memory location, the data is read one iteration after it was written. Therefore, this true dependence has a distance of 1. In many cases, the compiler loop dependence analysis may be able to statically determine the dependence distance. + +#### Example 4: Simple dependence distance +The compiler's static analysis facilities can infer that the distance of the true dependence in the following example code is 10 iterations. This has an impact on the scheduling of how iterations of the loop are issued into the generated pipelined datapath. For example, iteration `k` may not begin executing the load from array `a` before iteration `(k-10)` has completed storing the data into the same memory location. However, iterations `[k-9,k)` do not incur the scheduling constraint on the store in iteration `(k-10)` and begin execution earlier. +```c++ +for(i = 1; i < n; i++){ + S: a[i] = a[i-10]; +} +``` + +#### Example 5: Dependence distance across multiple loops in a nest +Statement `S`, in the code snippet that follows, forms two distinct true dependencies, one carried by loop `L1` and one by loop `L2`. Across iterations of loop `L1`, data is stored into a location in array `a` that is read in the next iteration. Similarly, across iterations of loop `L2`, data is stored into a location in array `a` that is read in a later iteration. In the latter case, the dependence across loop `L2` has dependence distance of 2. In the former, the dependence distance across loop `L1` has dependence distance of 1. Special care must be taken when reasoning about loop-carried memory dependencies spanning multiple loops. +```c++ +L1: for(i = 1; i < n; i++){ + L2: for(j = 1; j < m; j++){ + S: a[i][j] = a[i-1][j-2]; + } +} +``` + +### Specifying that memory accesses do *not* cause loop-carried dependencies +Apply the `ivdep` attribute to a loop to inform the compiler that ***none*** of the memory accesses within a loop incur loop-carried dependencies. 
+```c++ +[[intelfpga::ivdep]] +for (int i = 0; i < N; i++) { + A[i] = A[i - X[i]]; +} +``` +The `ivdep` attribute indicates to the compiler that it can disregard assumed loop-carried memory dependencies and generate a pipelined datapath for this loop capable of issuing new iterations as soon as possible (every cycle), maximizing possible throughput. + +### Specifying that memory accesses do *not* cause loop-carried dependencies across a fixed distance +Apply the `ivdep` attribute with a `safelen` parameter to set a specific lower bound on the dependence distance that can possibly be attributed to loop-carried dependencies in the associated loop. +```c++ +// n is a constant expression of integer type +[[intelfpga::ivdep(n)]] +for (int i = 0; i < N; i++) { + A[i] = A[i - X[i]]; +} +``` +The `ivdep` attribute informs the compiler to generate a pipelined loop datapath that can issue a new iteration as soon as the iteration `n` iterations ago has completed. The attribute parameter (`safelen`) is a refinement of the compiler static loop-carried dependence analysis that infers the dependence present in the code but is otherwise unable to accurately determine its distance. + +***IMPORTANT***: Applying the `ivdep` attribute or the `ivdep` attribute with a `safelen` parameter may lead to incorrect results if the annotated loop exhibits loop-carried memory dependencies. The attribute directs the compiler to generate hardware assuming no loop-carried dependencies. Specifying this assumption incorrectly is an invalid use of the attribute, and results in undefined (and likely incorrect) behavior. + +### Testing the Tutorial +In `loop_ivdep.cpp`, the `ivdep` attribute is applied to the kernel work loop with a `safelen` parameter of 1 and 128. 
+```c++ + TransposeAndFold(selector, A, B); // kMinSafelen = 1 + TransposeAndFold(selector, A, C); // kMaxSafelen = 128 +``` +The `ivdep` attribute with `safelen` parameter equal to 1 informs the compiler that iterations of the associated loop do not form a loop-carried memory dependence with a distance of at least 1. That is, the attribute is redundant and is equivalent to the code without the attribute in place. + +**_Try this!_**: Compile the tutorial program in `loop_ivdep.cpp` with and without the `[[intelfpga::ivdep]]` attribute altogether and compare the resulting reports. + +The `ivdep` attribute with `safelen` parameter equal to 128 is reflective of the maximum number of iterations of the associated loop among which no loop-carried memory dependence occurs. The annotated loop nest contains a dependence on values of array `temp_buffer`: + +```c++ +for (size_t j = 0; j < kMatrixSize * kRowLength; j++) { + for (size_t i = 0; i < kRowLength; i++) { + temp_buffer[j % kRowLength][i] += in_buffer[i][j % kRowLength]; + } +} +``` +Observe that the indexing expression on `temp_buffer` evaluates to the same index every `kRowLength` iterations of the `j` loop. Specifying the `ivdep` attribute on the `j` loop without a `safelen` parameter, or with a `safelen` parameter > `kRowLength` leads to undefined behavior because the generated hardware does not adhere to the ordering constraint imposed by the dependence. Specifying the `ivdep` attribute with a `safelen` parameter <= `kRowLength` is valid and will result in a better performing end result. + +## Key Concepts +* Basics of loop-carried dependencies +* The notion of a loop-carried dependence distance +* Determining what constitutes a *safe* dependence distance +* How to aid the compiler's dependence analysis to maximize performance + +## License +This code sample is licensed under MIT license.
+ + +## Building the `loop_ivdep` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `loop_ivdep_report.prj/reports/` or `loop_ivdep_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Navigate to the Loops Analysis section of the optimization report and look at the initiation interval (II) achieved by the two versions of the kernel. +* **`safelen(1)`** The II reported for this version of the kernel is 5 cycles. +You should see a message similar to "Compiler failed to schedule this loop with smaller II due to memory dependency." +* **`safelen(128)`** The II reported for this version of the kernel is 1 cycle, the optimal result. You should see a message similar to "a new iteration is issued into the pipelined loop datapath on every cycle". + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./loop_ivdep.fpga_emu (Linux) + loop_ivdep.fpga_emu.exe (Windows) + ``` +2.
Run the sample on the FPGA device: + ``` + ./loop_ivdep.fpga (Linux) + ``` + +### Example of Output + +``` +SAFELEN: 1 -- kernel time : 50.9517 ms +Throughput for kernel with SAFELEN 1: 1286KB/s +SAFELEN: 128 -- kernel time : 10 ms +Throughput for kernel with SAFELEN 128: 6277KB/s +PASSED: The results are correct +``` + +### Discussion of Results + +The following table summarizes the execution time (in ms) and throughput (in MFlops) for `safelen` parameters of 1 (redundant attribute) and 128 (`kRowLength`) for a default input matrix size of 128 x 128 floats on Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA and the Intel® oneAPI DPC++ Compiler (Beta). + +Safelen | Kernel Time (ms) | Throughput (KB/s) +------------- | ------------- | ----------------------- +1 | 50 | 1320 +128 | 10 | 6403 + +With the `ivdep` attribute applied with the maximum safe `safelen` parameter, the kernel execution time is decreased by a factor of ~5. + +Note that this performance difference will be apparent only when running on FPGA hardware. The emulator, while useful for verifying functionality, will generally not reflect differences in performance. 
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/loop_ivdep.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/loop_ivdep.sln new file mode 100755 index 0000000000..5f1a9b42a8 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/loop_ivdep.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "loop_ivdep", "loop_ivdep.vcxproj", "{3F5364B3-F987-4676-89A5-1F19BA3D4B75}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {3F5364B3-F987-4676-89A5-1F19BA3D4B75}.Debug|x64.ActiveCfg = Debug|x64 + {3F5364B3-F987-4676-89A5-1F19BA3D4B75}.Debug|x64.Build.0 = Debug|x64 + {3F5364B3-F987-4676-89A5-1F19BA3D4B75}.Release|x64.ActiveCfg = Release|x64 + {3F5364B3-F987-4676-89A5-1F19BA3D4B75}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {C0550E85-8C31-40EE-BFFA-F267DC16329D} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/loop_ivdep.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/loop_ivdep.vcxproj new file mode 100755 index 0000000000..ed0fb51757 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/loop_ivdep.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {3f5364b3-f987-4676-89a5-1f19ba3d4b75} + Win32Proj + loop_ivdep + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI 
DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)loop_ivdep.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)loop_ivdep.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/sample.json new file mode 100755 index 0000000000..b020452df4 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "CD8FE0A5-B31A-4906-8386-27416361FE24", + "name": "Loop IVDep Attribute", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating the usage of the loop ivdep attribute", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./loop_ivdep.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + 
"ninja fpga_emu", + "loop_ivdep.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/CMakeLists.txt new file mode 100755 index 0000000000..03d22779ce --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/CMakeLists.txt @@ -0,0 +1,89 @@ +set(SOURCE_FILE loop_ivdep.cpp) +set(TARGET_NAME loop_ivdep) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link 
${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/build.ninja new file mode 100755 index 0000000000..f076e77a88 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/build.ninja @@ -0,0 +1,30 @@ +source_file = loop_ivdep.cpp +target_name = loop_ivdep + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: 
gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/loop_ivdep.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/loop_ivdep.cpp new file mode 100755 index 0000000000..f2ddc838ac --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_ivdep/src/loop_ivdep.cpp @@ -0,0 +1,127 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include "dpc_common.hpp" + +constexpr size_t kRowLength = 128; +constexpr size_t kMinSafelen = 1; +constexpr size_t kMaxSafelen = kRowLength; +constexpr size_t kMatrixSize = kRowLength * kRowLength; + +using namespace sycl; + +template class KernelCompute; + +template +void TransposeAndFold(const device_selector &selector, + const std::array &m_input, + std::array &m_output) { + double kernel_time = 0; + try { + queue q(selector, dpc_common::exception_handler, + property::queue::enable_profiling{}); + + buffer buffer_input(m_input); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + buffer buffer_output(m_output.data(), kMatrixSize); + + event e = q.submit([&](handler &h) { + auto accessor_input = buffer_input.get_access(h); + auto accessor_output = buffer_output.get_access(h); + + h.single_task>([=]() + [[intel::kernel_args_restrict]] { + float in_buffer[kRowLength][kRowLength]; + float temp_buffer[kRowLength][kRowLength]; + + // Initialize local buffers + for (size_t i = 0; i < kMatrixSize; i++) { + in_buffer[i / kRowLength][i % kRowLength] = accessor_input[i]; + temp_buffer[i / kRowLength][i % kRowLength] = 0; + } + + // No iterations of the following loop store data into the same memory + // location that are less than kRowLength iterations apart. 
+ // The ivdep here instructs the compiler that it can safely assume no + // loop-carried dependencies over safe_len consecutive iterations. + [[intelfpga::ivdep(safe_len)]] + for (size_t j = 0; j < kMatrixSize * kRowLength; j++) { + #pragma unroll + for (size_t i = 0; i < kRowLength; i++) { + temp_buffer[j % kRowLength][i] += in_buffer[i][j % kRowLength]; + } + } + + // Write result to output + for (size_t i = 0; i < kMatrixSize; i++) { + accessor_output[i] = temp_buffer[i / kRowLength][i % kRowLength]; + } + }); + }); + + double start = e.get_profiling_info(); + double end = e.get_profiling_info(); + + // unit is nano second, convert to ms + kernel_time = (double)(end - start) * 1e-6; + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + std::cout << "safe_len: " << safe_len << " -- kernel time : " << kernel_time + << " ms\n"; + std::cout << "Throughput for kernel with safe_len " << safe_len << ": "; + std::cout << std::fixed << std::setprecision(0) + << (((double)kMatrixSize * sizeof(float) * 1e-3f) / + (kernel_time * 1e-3f)) << "KB/s\n"; +} + +int main() { + std::array A, B, C; + + // Initialize input with random data + for (size_t i = 0; i < kMatrixSize; i++) { + A[i] = static_cast(rand()) / static_cast(RAND_MAX); + } + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector selector; +#else + intel::fpga_selector selector; +#endif + + // Instantiate kernel logic with the min and max correct safelen parameter + // to compare performance. 
+ TransposeAndFold(selector, A, B); + TransposeAndFold(selector, A, C); + // You can also try removing the ivdep from the kernel entirely and + // recompiling to see what effect this has on performance. + + // Verify result + for (size_t i = 0; i < kMatrixSize; i++) { + if (B[i] != C[i]) { + std::cout << "FAILED: The results are incorrect" << '\n'; + return 1; + } + } + std::cout << "PASSED: The results are correct" << '\n'; + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/CMakeLists.txt new file mode 100755 index 0000000000..a94ffc91b3 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(LoopUnroll) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/README.md new file mode 100755 index 0000000000..5c2528eeb0 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/README.md @@ -0,0 +1,188 @@ + +# Unrolling Loops +This FPGA tutorial demonstrates a simple example of unrolling loops to improve the throughput of a DPC++ FPGA program. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Basics of loop unrolling
How to unroll loops in your program
Determining the optimal unroll factor for your program +| Time to complete | 15 minutes + +_Notice: This code sample is not yet supported in Windows*_ + +## Purpose + +The loop unrolling mechanism is used to increase program parallelism by duplicating the compute logic within a loop. The number of times the loop logic is duplicated is called the *unroll factor*. Depending on whether the *unroll factor* is equal to the number of loop iterations or not, loop unroll methods can be categorized as *full-loop unrolling* and *partial-loop unrolling*. + +### Example: Full-Loop Unrolling +```c++ +// Before unrolling loop +#pragma unroll +for(i = 0 ; i < 5; i++){ + a[i] += 1; +} + +// Equivalent code after unrolling +// There is no longer any loop +a[0] += 1; +a[1] += 1; +a[2] += 1; +a[3] += 1; +a[4] += 1; +``` +A full unroll is a special case where the unroll factor is equal to the number of loop iterations. Here, the Intel® oneAPI DPC++ Compiler for FPGA instantiates five adders instead of one adder. + +### Example: Partial-Loop Unrolling + +```c++ +// Before unrolling loop +#pragma unroll 4 +for(i = 0 ; i < 20; i++){ + a[i] += 1; +} + +// Equivalent code after unrolling by a factor of 4 +// The resulting loop has five (20 / 4) iterations +for(i = 0 ; i < 5; i++){ + a[i * 4] += 1; + a[i * 4 + 1] += 1; + a[i * 4 + 2] += 1; + a[i * 4 + 3] += 1; +} +``` +Each loop iteration in the "equivalent code" contains four unrolled invocations of the first. The Intel® oneAPI DPC++ Compiler (Beta) for FPGA instantiates four adders instead of one adder. Because there is no data dependency between iterations in the loop in this case, the compiler schedules all four adds in parallel. + +### Determining the optimal unroll factor +In an FPGA design, unrolling loops is a common strategy to directly trade off on-chip resources for increased throughput. When selecting the unroll factor for a specific loop, the intent is to improve throughput while minimizing resource utilization. 
It is also important to be mindful of other throughput constraints in your system, such as memory bandwidth. + +### Tutorial design +This tutorial demonstrates this trade-off with a simple vector add kernel. The tutorial shows how increasing the unroll factor on a loop increases throughput... until another bottleneck is encountered. This example is constructed to run up against global memory bandwidth constraints. + +The memory bandwidth on an Intel® Programmable Acceleration Card with Intel Arria® 10 GX FPGA system is about 6 GB/s. The tutorial design will likely run at around 300 MHz. In this design, the FPGA design processes a new iteration every cycle in a pipeline-parallel fashion. The theoretical computation limit for 1 adder is: + +**GFlops**: 300 MHz \* 1 float = 0.3 GFlops + +**Computation Bandwidth**: 300 MHz \* 1 float * 4 Bytes = 1.2 GB/s + +You repeat this back-of-the-envelope calculation for different unroll factors: + +Unroll Factor | GFlops (GB/s) | Computation Bandwidth (GB/s) +------------- | ------------- | ----------------------- +1 | 0.3 | 1.2 +2 | 0.6 | 2.4 +4 | 1.2 | 4.8 +8 | 2.4 | 9.6 +16 | 4.8 | 19.2 + +On an Intel® Programmable Acceleration Card with Intel Arria® 10 GX FPGA, it is reasonable to predict that this program will become memory-bandwidth limited when the unroll factor grows from 4 to 8. Check this prediction by running the design following the instructions below. + + +## Key Concepts +* Basics of loop unrolling. +* How to unroll loops in your program. +* Determining the optimal unroll factor for your program. + +## License +This code sample is licensed under MIT license. + + +## Building the `loop_unroll` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. 
+ +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `loop_unroll_report.prj/reports/` or `loop_unroll_s10_pac_report.prj/reports/` directory. 
Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Navigate to the Area Report and compare the FPGA resource utilization of the kernels with unroll factors of 1, 2, 4, 8, and 16. In particular, check the number of DSP resources consumed. You should see the area grow roughly linearly with the unroll factor. + +You can also check the achieved system fMAX in order to verify the earlier calculations. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./loop_unroll.fpga_emu (Linux) + ``` +2. Run the sample on the FPGA device: + ``` + ./loop_unroll.fpga (Linux) + ``` + +### Example of Output +``` +Input Array Size: 67108864 +UnrollFactor 1 kernel time : 255.749 ms +Throughput for kernel with UnrollFactor 1: 0.262 GFlops +UnrollFactor 2 kernel time : 140.285 ms +Throughput for kernel with UnrollFactor 2: 0.478 GFlops +UnrollFactor 4 kernel time : 68.296 ms +Throughput for kernel with UnrollFactor 4: 0.983 GFlops +UnrollFactor 8 kernel time : 44.567 ms +Throughput for kernel with UnrollFactor 8: 1.506 GFlops +UnrollFactor 16 kernel time : 39.175 ms +Throughput for kernel with UnrollFactor 16: 1.713 GFlops +PASSED: The results are correct +``` + +### Discussion of Results +The following table summarizes the execution time (in ms), throughput (in GFlops), and number of DSPs used for unroll factors of 1, 2, 4, 8, and 16 for a default input array size of 64M floats (2 ^ 26 floats) on Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA: + +Unroll Factor | Kernel Time (ms) | Throughput (GFlops) | Num of DSPs +------------- | ------------- | -----------------------| ------- +1 | 242 | 0.277 | 1 +2 | 127 | 0.528 | 2 +4 | 63 | 1.065 | 4 +8 | 46 | 1.459 | 8 +16 | 44 | 1.525 | 16 + +Notice that when the unroll factor increases from 1 to 2 and from 2 to 4, the kernel execution time decreases by a factor of two. Correspondingly, the kernel throughput doubles. 
However, when the unroll factor is increased from 4 to 8 and from 8 to 16, the throughput no longer scales by a factor of two at each step. The design is now bound by memory bandwidth limitations instead of compute unit limitations even though the hardware is replicated. + +These performance differences will be apparent only when running on FPGA hardware. The emulator, while useful for verifying functionality, will generally not reflect differences in performance. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/sample.json new file mode 100755 index 0000000000..3863df9d59 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/sample.json @@ -0,0 +1,34 @@ +{ + "guid": "2760C1B6-25E5-4280-9F8F-34CA8DDEDA7C", + "name": "Unrolling Loops", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial design demonstrating the loop_unroll pragma", + "toolchain": ["dpcpp"], + "os": ["linux"], + "targetDevice": ["FPGA"], + "builder": ["cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./loop_unroll.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/src/CMakeLists.txt new file mode 100755 index 0000000000..3ca0487ff3 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/src/CMakeLists.txt @@ -0,0 +1,89 @@ +set(SOURCE_FILE loop_unroll.cpp) +set(TARGET_NAME loop_unroll) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME 
"intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link 
${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/src/loop_unroll.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/src/loop_unroll.cpp new file mode 100755 index 0000000000..bab7954bea --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/loop_unroll/src/loop_unroll.cpp @@ -0,0 +1,138 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +template class VAdd; + +// This function instantiates the vector add kernel, which contains +// a loop that adds up the two summand arrays and stores the result +// into sum. This loop will be unrolled by the specified unroll_factor. 
+template +void VecAdd(const std::vector &summands1, + const std::vector &summands2, std::vector &sum, + size_t array_size) { + + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + queue q(device_selector, dpc_common::exception_handler, + property::queue::enable_profiling{}); + + buffer buffer_summands1(summands1); + buffer buffer_summands2(summands2); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + buffer buffer_sum(sum.data(), array_size); + + event e = q.submit([&](handler &h) { + auto acc_summands1 = buffer_summands1.get_access(h); + auto acc_summands2 = buffer_summands2.get_access(h); + auto acc_sum = buffer_sum.get_access(h); + + h.single_task>([=]() + [[intel::kernel_args_restrict]] { + // Unroll the loop fully or partially, depending on unroll_factor + #pragma unroll unroll_factor + for (size_t i = 0; i < array_size; i++) { + acc_sum[i] = acc_summands1[i] + acc_summands2[i]; + } + }); + }); + + double start = e.get_profiling_info(); + double end = e.get_profiling_info(); + // convert from nanoseconds to ms + double kernel_time = (double)(end - start) * 1e-6; + + std::cout << "unroll_factor " << unroll_factor + << " kernel time : " << kernel_time << " ms\n"; + std::cout << "Throughput for kernel with unroll_factor " << unroll_factor + << ": "; + std::cout << std::fixed << std::setprecision(3) + << ((double)array_size / kernel_time) / 1e6f << " GFlops\n"; + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } +} + +int main(int argc, char *argv[]) { + size_t array_size = 1 << 26; + + if (argc > 1) { + std::string option(argv[1]); + if (option == "-h" || option == "--help") { + std::cout << "Usage: \n \n\nFAILED\n"; + return 1; + } else { + array_size = std::stoi(option); + } + } + + std::vector summands1(array_size); + std::vector summands2(array_size); + + std::vector sum_unrollx1(array_size); + std::vector sum_unrollx2(array_size); + std::vector sum_unrollx4(array_size); + std::vector sum_unrollx8(array_size); + std::vector sum_unrollx16(array_size); + + // Initialize the two summand arrays (arrays to be added to each other) to + // 1:N and N:1, so that the sum of all elements is N + 1 + for (size_t i = 0; i < array_size; i++) { + summands1[i] = static_cast(i + 1); + summands2[i] = static_cast(array_size - i); + } + + std::cout << "Input Array Size: " << array_size << "\n"; + + // Instantiate VecAdd kernel with different unroll factors: 1, 2, 4, 8, 16 + // The VecAdd kernel contains a loop that adds up the two summand arrays. + // This loop will be unrolled by the specified unroll factor. + // The sum array is expected to be identical, regardless of the unroll factor. 
+ VecAdd<1>(summands1, summands2, sum_unrollx1, array_size); + VecAdd<2>(summands1, summands2, sum_unrollx2, array_size); + VecAdd<4>(summands1, summands2, sum_unrollx4, array_size); + VecAdd<8>(summands1, summands2, sum_unrollx8, array_size); + VecAdd<16>(summands1, summands2, sum_unrollx16, array_size); + + // Verify that the output data is the same for every unroll factor + for (size_t i = 0; i < array_size; i++) { + if (sum_unrollx1[i] != summands1[i] + summands2[i] || + sum_unrollx1[i] != sum_unrollx2[i] || + sum_unrollx1[i] != sum_unrollx4[i] || + sum_unrollx1[i] != sum_unrollx8[i] || + sum_unrollx1[i] != sum_unrollx16[i]) { + std::cout << "FAILED: The results are incorrect\n"; + return 1; + } + } + std::cout << "PASSED: The results are correct\n"; + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/CMakeLists.txt new file mode 100755 index 0000000000..e281d8cb1a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(MaxConcurrency) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/README.md new file mode 100755 index 0000000000..68d681dd31 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/README.md @@ -0,0 +1,172 @@ +# Maximum Concurrency of a Loop +This FPGA tutorial explains how to use the max_concurrency attribute for loops. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | The basic usage of the `max_concurrency` attribute
How the `max_concurrency` attribute affects loop throughput and resource use
How to apply the `max_concurrency` attribute to loops in your program
How to identify the correct `max_concurrency` factor for your program +| Time to complete | 15 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +This tutorial demonstrates a simple example of applying the `max_concurrency` attribute to a loop in a task kernel to trade off the on-chip memory use and throughput of the loop. + +### Description of the `max_concurrency` Attribute +The `max_concurrency` attribute is a loop attribute that enables you to control the number of simultaneously executed loop iterations. To enable this simultaneous execution, the compiler creates copies of any memory that is private to a single iteration. These copies are called private copies. The greater the permitted concurrency, the more private copies the compiler must create. + +#### Example: + +Kernels in this tutorial design apply `[[intelfpga::max_concurrency(N)]]` to an outer loop that contains two inner loops, which perform a partial sum computation on an input array, storing the results in a private (to the outer loop) array `a1`. The following is an example of a loop nest: + +``` +[[intelfpga::max_concurrency(1)]] +for (size_t i = 0; i < max_iter; i++) { + float a1[size]; + for (int j = 0; j < size; j++) + a1[j] = accessorA[i * 4 + j] * shift; + for (int j = 0; j < size; j++) + result += a1[j]; +} +``` + +In this example, the maximum concurrency allowed for the outer loop is 1, that is, only one iteration of the outer loop is allowed to be simultaneously executing at any given moment. The `max_concurrency` attribute in this example forces the compiler to create exactly one private copy of the array `a1`. Passing the parameter `N` to the `max_concurrency` attribute limits the concurrency of the loop to `N` simultaneous iterations, and `N` private copies of privately-declared arrays in that loop. 
+ +### Identifying the Correct `max_concurrency` Factor +Generally, increasing the maximum concurrency allowed for a loop through the use of the `max_concurrency` attribute increases the throughput of that loop at the cost of increased memory resource use. Additionally, in nearly all cases, there is a point at which increasing the maximum concurrency does not have any further effect on the throughput of the loop, as the maximum exploitable concurrency of that loop has been achieved. + +The correct `max_concurrency` factor for a loop depends on the goals of your design, the criticality of the loop in question, and its impact on the overall throughput of your design. A typical design flow may be to: +1. Experiment with different values of `max_concurrency`. +2. Observe what impact the values have on the overall throughput and memory use of your design. +3. Choose the appropriate value that allows you to achieve your desired throughput and area goals. + +## Key Concepts +* The basic usage of the `max_concurrency` attribute +* How the `max_concurrency` attribute affects loop throughput and resource use +* How to apply the `max_concurrency` attribute to loops in your program +* How to identify the correct `max_concurrency` factor for your program + +## License +This code sample is licensed under MIT license. + +## Building the `max_concurrency` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). 
+ +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the FPGA hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). 
For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `max_concurrency_report.prj/reports/` or `max_concurrency_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +On the main report page, scroll down to the section titled "Estimated Resource Usage". Each kernel name ends in the max_concurrency attribute argument used for that kernel, e.g., `kernelCompute1` uses a max_concurrency attribute value of 1. You can verify that the number of RAMs used for each kernel increases with the max_concurrency value used, with the exception of max_concurrency 0, which instructs the compiler to choose a default value. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./max_concurrency.fpga_emu (Linux) + max_concurrency.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./max_concurrency.fpga (Linux) + ``` + + +### Example of Output +``` +Max concurrency 0 kernel time : 1459.89 ms +Throughput for kernel with max_concurrency 0: 0.561 GFlops +Max concurrency 1 kernel time : 2890.810 ms +Throughput for kernel with max_concurrency 1: 0.283 GFlops +Max concurrency 2 kernel time : 1460.227 ms +Throughput for kernel with max_concurrency 2: 0.561 GFlops +Max concurrency 4 kernel time : 1459.970 ms +Throughput for kernel with max_concurrency 4: 0.561 GFlops +Max concurrency 8 kernel time : 1460.034 ms +Throughput for kernel with max_concurrency 8: 0.561 GFlops +Max concurrency 16 kernel time : 1459.901 ms +Throughput for kernel with max_concurrency 16: 0.561 GFlops +PASSED: The results are correct +``` + +### Discussion of Results + +The stdout output shows the giga-floating point operations per second (GFlops) for each kernel. 
+ +When run on the Intel® PAC with Intel Arria® 10 GX FPGA hardware board, we see that the throughput doubles from using max_concurrency 1 to max_concurrency 2, after which increasing the value of max_concurrency does not increase the GFlops achieved, i.e., increasing the max_concurrency above 2 will spend additional RAM resources for no additional throughput gain. As such, for this tutorial design, maximal throughput is best achieved by using max_concurrency 2. + +Using max_concurrency 0 (or equivalently omitting the attribute entirely) also produced good throughput, indicating that the compiler's default heuristic chose a concurrency of 2 or higher in this case. + +When run on the FPGA emulator, the max_concurrency attribute has no effect on runtime. You may notice that the emulator achieved higher throughput than the FPGA in this example. This is because this trivial example uses only a tiny fraction of the spatial compute resources available on the FPGA. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/max_concurrency.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/max_concurrency.sln new file mode 100755 index 0000000000..761fdc2009 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/max_concurrency.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "max_concurrency", "max_concurrency.vcxproj", "{F0CE4972-62AF-4B9F-996F-1D1DB14D76B7}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {F0CE4972-62AF-4B9F-996F-1D1DB14D76B7}.Debug|x64.ActiveCfg = Debug|x64 + {F0CE4972-62AF-4B9F-996F-1D1DB14D76B7}.Debug|x64.Build.0 = Debug|x64 + 
{F0CE4972-62AF-4B9F-996F-1D1DB14D76B7}.Release|x64.ActiveCfg = Release|x64 + {F0CE4972-62AF-4B9F-996F-1D1DB14D76B7}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {3247AB7C-282F-4907-B1F4-E944349A8835} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/max_concurrency.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/max_concurrency.vcxproj new file mode 100755 index 0000000000..49b65a1722 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/max_concurrency.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {f0ce4972-62af-4b9f-996f-1d1db14d76b7} + Win32Proj + max_concurrency + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)max_concurrency.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)max_concurrency.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true 
+ true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/sample.json new file mode 100755 index 0000000000..9a9253ef17 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "7909FAE1-D3D4-4E97-A963-14A884F33495", + "name": "Maximum Concurrency of a Loop", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "How to use the max_concurrency attribute for single_task loops", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./max_concurrency.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "max_concurrency.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/CMakeLists.txt new file mode 100755 index 0000000000..24d6d8302a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/CMakeLists.txt @@ -0,0 +1,90 @@ +set(SOURCE_FILE max_concurrency.cpp) +set(TARGET_NAME max_concurrency) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +set(AOC_SEED_FLAG "-Xsseed=4 -Xsparallel=2") +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume 
target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${AOC_SEED_FLAG} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o 
${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/build.ninja new file mode 100755 index 0000000000..b3a66f686b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/build.ninja @@ -0,0 +1,30 @@ +source_file = max_concurrency.cpp +target_name = max_concurrency + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware -Xsseed=3 +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac 
${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/max_concurrency.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/max_concurrency.cpp new file mode 100755 index 0000000000..cec706dd17 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/max_concurrency/src/max_concurrency.cpp @@ -0,0 +1,187 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +constexpr size_t kSize = 8192; +constexpr size_t kMaxIter = 50000; +constexpr size_t kTotalOps = 2 * kMaxIter * kSize; +constexpr size_t kMaxValue = 128; + +using FloatArray = std::array; +using FloatScalar = std::array; + +template class Compute; + +// Launch a kernel on the device specified by selector. +// The kernel's functionality is designed to show the +// performance impact of the max_concurrency attribute. +template +void PartialSumWithShift(const device_selector &selector, + const FloatArray &array, float shift, + FloatScalar &result) { + double kernel_time = 0.0; + + try { + + queue q(selector, dpc_common::exception_handler, + property::queue::enable_profiling{}); + + buffer buffer_array(array); + buffer buffer_result(result.data(), 1); + + event e = q.submit([&](handler &h) { + auto accessor_array = buffer_array.get_access(h); + auto accessor_result = buffer_result.get_access(h); + + h.single_task>([=]() + [[intel::kernel_args_restrict]] { + float r = 0; + + // At most concurrency iterations of the outer loop will be + // active at one time. + // This limits memory usage, since each iteration of the outer + // loop requires its own copy of a1. 
+ [[intelfpga::max_concurrency(concurrency)]] + for (size_t i = 0; i < kMaxIter; i++) { + float a1[kSize]; + for (size_t j = 0; j < kSize; j++) + a1[j] = accessor_array[(i * 4 + j) % kSize] * shift; + for (size_t j = 0; j < kSize; j++) + r += a1[j]; + } + accessor_result[0] = r; + }); + }); + + // SYCL event profiling allows the kernel execution to be timed + double start = e.get_profiling_info(); + double end = e.get_profiling_info(); + kernel_time = (double)(end - start) * 1e-6; + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + // The performance of the kernel is measured in GFlops, based on: + // 1) the number of floating-point operations performed by the kernel. + // This can be calculated easily for the simple example kernel. + // 2) the kernel execution time reported by SYCL event profiling. + std::cout << "Max concurrency " << concurrency << " " + << "kernel time : " << kernel_time << " ms\n"; + std::cout << "Throughput for kernel with max_concurrency " << concurrency + << ": "; + std::cout << std::fixed << std::setprecision(3) + << ((double)(kTotalOps) / kernel_time) / 1e6f << " GFlops\n"; +} + +// Calculates the expected results. Used to verify that the kernel +// is functionally correct. 
+float GoldenResult(const FloatArray &A, float shift) { + float gr = 0; + for (size_t i = 0; i < kMaxIter; i++) { + float a1[kSize]; + for (size_t j = 0; j < kSize; j++) + a1[j] = A[(i * 4 + j) % kSize] * shift; + for (size_t j = 0; j < kSize; j++) + gr += a1[j]; + } + return gr; +} + +int main() { + bool success = true; + + FloatArray A; + FloatScalar R0, R1, R2, R3, R4, R5; + + float shift = (float)(rand() % kMaxValue); + + // initialize the input data + for (size_t i = 0; i < kSize; i++) + A[i] = rand() % kMaxValue; + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector selector; +#else + intel::fpga_selector selector; +#endif + + // Run the kernel with different values of the max_concurrency + // attribute, to determine the optimal concurrency. + // In this case, the optimal max_concurrency is 2 since this + // achieves the highest GFlops. Higher values of max_concurrency + // consume additional RAM without increasing GFlops. + PartialSumWithShift<0>(selector, A, shift, R0); + PartialSumWithShift<1>(selector, A, shift, R1); + PartialSumWithShift<2>(selector, A, shift, R2); + PartialSumWithShift<4>(selector, A, shift, R3); + PartialSumWithShift<8>(selector, A, shift, R4); + PartialSumWithShift<16>(selector, A, shift, R5); + + // compute the actual result here + float gr = GoldenResult(A, shift); + + // verify the results are correct + if (gr != R0[0]) { + std::cout << "Max Concurrency 0: mismatch: " << R0[0] << " != " << gr + << " (kernel != expected)" << '\n'; + success = false; + } + + if (gr != R1[0]) { + std::cout << "Max Concurrency 1: mismatch: " << R1[0] << " != " << gr + << " (kernel != expected)" << '\n'; + success = false; + } + + if (gr != R2[0]) { + std::cout << "Max Concurrency 2: mismatch: " << R2[0] << " != " << gr + << " (kernel != expected)" << '\n'; + success = false; + } + + if (gr != R3[0]) { + std::cout << "Max Concurrency 4: mismatch: " << R3[0] << " != " << gr + << " (kernel != expected)" << '\n'; + success = false; + } + + if (gr 
!= R4[0]) { + std::cout << "Max Concurrency 8: mismatch: " << R4[0] << " != " << gr + << " (kernel != expected)" << '\n'; + success = false; + } + + if (gr != R5[0]) { + std::cout << "Max Concurrency 16: mismatch: " << R5[0] << " != " << gr + << " (kernel != expected)" << '\n'; + success = false; + } + + if (success) { + std::cout << "PASSED: The results are correct\n"; + return 0; + } + + return 1; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/CMakeLists.txt new file mode 100755 index 0000000000..125d32c072 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(MemoryAttributesOverview) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/README.md new file mode 100755 index 0000000000..1884982c77 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/README.md @@ -0,0 +1,277 @@ + +# On-Chip Memory Attributes +This FPGA tutorial demonstrates how to use on-chip memory attributes to control memory structures in your DPC++ program. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | The basic concepts of on-chip memory attributes
How to apply memory attributes in your program
How to confirm that the memory attributes were respected by the compiler
A case study of the type of performance/area trade-offs enabled by memory attributes +| Time to complete | 30 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +For each private or local array in your DPC++ FPGA device code, the Intel® oneAPI DPC++ Compiler creates a custom memory system in your program's datapath to contain the contents of that array. The compiler has many options to choose from when architecting this on-chip memory structure. Memory attributes are a set of DPC++ extensions for FPGA that enable you to override the compiler's internal heuristics and to control the architecture of kernel memory. + +### Introduction to Memory Attributes + +To maximize kernel throughput, your design's datapath should have stall-free accesses to all of its memory systems. A memory read or write is said to be *stall-free* if the compiler can prove that it has contention-free access to a memory port. A memory system is stall-free if all of its accesses have this property. Wherever possible, the compiler will try to create a minimum-area, stall-free memory system. + +If a different area performance trade-off is desired, or if the compiler fails to find the best configuration, you can use memory attributes to override the compiler’s decisions and specify the memory configuration you need. + +Memory attributes can be applied to any variable or array defined within the kernel and to struct data members in struct declarations. The compiler supports the following memory attributes: + +| Memory Attribute | Description +--- |--- +| intelfpga::register | Forces a variable or array to be carried through the pipeline in registers. +| intelfpga::memory("`impl_type`") | Forces a variable or array to be implemented as embedded memory. The optional string parameter `impl_type` can be `BLOCK_RAM` or `MLAB`. 
+| intelfpga::numbanks(N) | Specifies that the memory implementing the variable or array must have N memory banks. +| intelfpga::bankwidth(W) | Specifies that the memory implementing the variable or array must be W bytes wide. +| intelfpga::singlepump | Specifies that the memory implementing the variable or array should be clocked at the same rate as the accesses to it. +| intelfpga::doublepump | Specifies that the memory implementing the variable or array should be clocked at twice the rate as the accesses to it. +| intelfpga::max_replicates(N) | Specifies that a maximum of N replicates should be created to enable simultaneous reads from the datapath. +| intelfpga::private_copies(N) | Specifies that a maximum of N private copies should be created to enable concurrent execution of N pipelined threads. +| intelfpga::simple_dual_port | Specifies that the memory implementing the variable or array should have no port that services both reads and writes. +| intelfpga::merge("`key`", "`type`") | Merge two or more variables or arrays in the same scope width-wise or depth-wise. All variables with the same `key` string are merged into the same memory system. The string `type` can be either `width` or `depth`. +| intelfpga::bank_bits(b0,b1,...,bn) | Specifies that the local memory addresses should use bits (b0,b1,...,bn) for bank-selection, where (b0,b1,...,bn) are indicated in terms of word-addressing. The bits of the local memory address not included in (b0,b1,...,bn) will be used for word-selection in each bank. + + +#### Example 1: Applying memory attributes to private arrays +```c++ +q.submit([&](handler &h) { + h.single_task([=]() { + // Create a kernel memory 8 bytes wide (2 integers per memory word) + // and split the contents into 2 banks (each bank will contain 32 + // integers in 16 memory words). + [[intelfpga::bankwidth(8), intelfpga::numbanks(2)]] int a[64]; + + // Force array 'b' to be carried live in the data path using + // registers. 
+ [[intelfpga::register]] int b[64]; + + // Merge 'mem_A' and 'mem_B' width-wise so that they are mapped + // to the same kernel memory system. + [[intelfpga::merge("mem", "width")]] unsigned short mem_A[64]; + [[intelfpga::merge("mem", "width")]] unsigned short mem_B[64]; + + // ... + }); +}); + +``` + +#### Example 2: Applying memory attributes to struct data members +```c++ +// Memory attributes can be specified for struct data members +// within the struct declaration. +struct State { + [[intelfpga::numbanks(2)]] int mem[64]; + [[intelfpga::register]] int reg[8]; +}; + +q.submit([&](handler &h) { + h.single_task([=]() { + // The compiler will create two memory systems from S1: + // - S1.mem[64] implemented in kernel memory that has 2 banks + // - S1.reg[8] implemented in registers + State S1; + + // In this case, we have attributes on struct declaration as + // well as struct instantiation. When this happens, the outer + // level attribute takes precedence. Here, the compiler will + // generate a single memory system for S2 which will have 4 + // banks. + [[intelfpga::numbanks(4)]] State S2; + + // ... + }); +}); + +``` + +### Tutorial Code Overview +This tutorial demonstrates the trade-offs between choosing a single-pumped and double-pumped memory system for your kernel. We will apply the attributes `[[intelfpga::singlepump]]` and `[[intelfpga::doublepump]]` to the two-dimensional array `dict_offset`. + +The tutorial enqueues three versions of the same kernel: +* `dict_offset` is single-pumped +* `dict_offset` is double-pumped +* `dict_offset` unconstrained (compiler heuristics choose the memory configuration) + +For both single-pumped and double-pumped versions, additional memory attributes direct the compiler to implement `dict_offset` in MLABs (as the size of the array is small), to use `kVec` banks, and to confine the number of replicates in each bank to no more than `kVec`. 
+ +### Accesses to `dict_offset` + +Array `dict_offset` has the following accesses: + + * **Initialization**: It is initialized by copying the contents of global memory `dict_offset_init` using `kVec` writes. + * **Reads**: It is read `kVec*kVec` times. + * **Writes**: There are `kVec` writes updating the values at some indices. + +After all loops are unrolled, the innermost dimension of every access is known at compile time (e.g. `dict_offset[i][k]` becomes `dict_offset[i][0]`, `dict_offset[i][1]`, etc.). + +### Banks and replicates of `dict_offset` + +If we partition the memory system such that array elements `dict_offset[:][0]` (where `:` denotes all indices in range) are contained in Bank 0, `dict_offset[:][1]` are contained in Bank 1, and so on, each access is confined to a single bank. This partitioning is achieved by requesting the compiler to generate `kVec` banks. + +In total, there are `kVec` reads from each bank. To make these reads stall-free, we request `kVec` replicates per bank so that (if needed) each read can occur simultaneously from a separate replicate. Since all replicates in a bank must contain identical data, a write to a bank must go to all replicates. + +For single-pumped memories, each replicate has 2 physical ports. In the tutorial code, one of these ports is used for writing and one for reading. The compiler must generate `kVec` replicates per bank to create stall-free accesses for `kVec` reads. + +For double-pumped memories, each replicate effectively has 4 ports, three of which are available for reads. Hence, the compiler needs fewer replicates per bank to create stall-free reads. However, this can incur a system fMAX penalty. + +The choice of attributes will be further discussed in the [Examining the Reports](#examining-the-reports) section.
+ + +## Key Concepts +* The basic concepts of on-chip memory attributes +* How to apply memory attributes in your program +* How to confirm that the memory attributes were respected by the compiler +* A case study of the type of performance/area trade-offs enabled by memory attributes + +## License +This code sample is licensed under MIT license. + +## Building the `memory_attributes` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. 
(Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `memory_attributes_report.prj/reports/` or `memory_attributes_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Navigate to the Kernel Memory Viewer (System Viewers > Kernel Memory Viewer). In the Kernel Memory List pane, click on `dict_offset` under the function `KernelCompute`, for each of +* N=0 : unconstrained configuration (compiler's choice) +* N=1 : single-pumped configuration +* N=2 : double-pumped configuration + +This view provides information about the memory configuration. The user-specified memory attributes are listed in the "Details" pane. 
+ +### Comparing the memory configurations + +For both single-pumped and double-pumped versions of the kernel, the compiler generates `kVec` banks and implements the memory in MLABs, as was requested through memory attributes. The main difference between these two memory systems is the number of replicates within each bank. To see the number of replicates per bank, click any bank label (say Bank 0) under `dict_offset`. + +For the single-pumped memory system, the compiler created 4 replicates per bank, whereas for the double-pumped memory system, the compiler created 2 replicates per bank. A single-pumped replicate has 2 physical ports and a double-pumped replicate has 4 (effective) physical ports. For this reason, the compiler required twice as many replicates to create a stall-free system in the single-pumped version as compared to the double-pumped version. + +### Area implications + +This also means that the FPGA resources needed to generate the stall-free memory systems differ between the two versions. In the report, navigate to the Area Analysis of System view (Area Analysis > Area Analysis of System) and click "Expand All". For the single-pumped version, you can see that the compiler used 32 MLABs to implement the memory system for `dict_offset`, whereas for the double-pumped version, the compiler used only 16 MLABs. However, the double-pumped version of the memory required additional ALUTs and FFs to implement the double-pumping logic. + +In general, double-pumped memories are more area-efficient than single-pumped memories. + +### fMAX implications + +The use of double-pumped memories can impact the fMAX of your system. Double-pumped memories have to be clocked at twice the frequency of the rest of the datapath, and the resulting cross-clock domain transfer can reduce fMAX. The effect is particularly pronounced when double-pumping MLABs.
+ +In this tutorial, both the single-pumped and double-pumped version of the kernel share a single clock domain, so the difference in fMAX cannot be directly observed in the report. + +If you want to observe the fMAX effect, modify the code to enqueue only the single-pumped (or only the double-pumped) version of the kernel. Only the report generated from a full FPGA compile (`make fpga`) will provide fMAX information. + +The table that follows summarizes the fMAX achieved when compiling single-kernel variants of the tutorial design on an Intel® PAC with Intel® Arria® 10 GX FPGA. + +Variant | Fmax (MHz) | \# MLABs in `dict_offset` +------------- | ------------- | -------- +Single-pumped | 307.9 | 32 +Double-pumped | 200.0 | 16 + +Note that the numbers reported in the table will vary slightly from compile to compile. + +### Trade-offs +There are often many ways to generate a stall-free memory system. As a programmer, the implementation you choose depends on your design constraints. + + - If your design is limited by the available memory resources (block RAMs and MLABs), using double-pumped memory systems can help your design fit in the FPGA device. + - If the fMAX of your design is limited by double-pumped memory systems in your kernel, forcing all memory systems to be single-pumped might increase the fMAX. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./memory_attributes.fpga_emu (Linux) + memory_attributes.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./memory_attributes.fpga (Linux) + ``` + +### Example of Output +``` +PASSED: all kernel results are correct. +``` + +### Discussion + +Feel free to experiment further with the tutorial code. You can: + - Change the memory implementation type to block RAMs (using `[[intelfpga::memory("BLOCK_RAM")]]`) or registers (using `[[intelfpga::register]]`) to see how it affects the area and fMAX of the tutorial design.
+ - Vary `kRows` and/or `kVec` (both in powers of 2) to see how it effects the trade-off between single-pumped and double-pumped memories. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/memory_attributes.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/memory_attributes.sln new file mode 100755 index 0000000000..3f1de9b8b8 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/memory_attributes.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "memory_attributes", "memory_attributes.vcxproj", "{D6A634E7-9F2B-46C2-A21C-2402F631A55A}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Debug|x64.ActiveCfg = Debug|x64 + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Debug|x64.Build.0 = Debug|x64 + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Release|x64.ActiveCfg = Release|x64 + {D6A634E7-9F2B-46C2-A21C-2402F631A55A}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {4AC13DD2-5B0F-4051-93BF-85AEAF6E50C9} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/memory_attributes.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/memory_attributes.vcxproj new file mode 100755 index 0000000000..f797c91ef5 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/memory_attributes.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + 
+ + + + + + + 15.0 + {d6a634e7-9f2b-46c2-a21c-2402f631a55a} + Win32Proj + memory_attributes + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)memory_attributes.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)memory_attributes.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/sample.json new file mode 100755 index 0000000000..8c18593331 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "31BCA673-F514-4E2E-A8B3-A0B42D63884C", + "name": "On-Chip Memory Attributes", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating the use of memory attributes to control memory structures in a DPC++ program.", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + 
"linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./memory_attributes.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "memory_attributes.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/CMakeLists.txt new file mode 100755 index 0000000000..290fd004e5 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/CMakeLists.txt @@ -0,0 +1,96 @@ +set(SOURCE_FILE memory_attributes.cpp) +set(TARGET_NAME memory_attributes) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) + +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND 
echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS "${HARDWARE_COMPILE_FLAGS}") + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) + +endif() + +# report + +if(WIN32) + set(REPORT ${TARGET_NAME}_report.a) + + add_custom_target(report DEPENDS ${REPORT}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${SOURCE_FILE} COPYONLY) + + add_custom_command(OUTPUT ${REPORT} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_BINARY_DIR}/${TARGET_NAME}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT} + DEPENDS ${SOURCE_FILE}) + +else() + set(REPORT ${TARGET_NAME}_report.a) + + add_custom_target(report DEPENDS ${REPORT}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${REPORT} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORT} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/build.ninja new file mode 100755 index 0000000000..5a8b871482 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/build.ninja @@ -0,0 +1,41 @@ +source_file = memory_attributes.cpp +target_name = memory_attributes + +emulator_target = ${target_name}.fpga_emu.exe + 
+hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report_1x + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -DSINGLEPUMP -o $out + +rule gen_report_2x + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -DDOUBLEPUMP -o $out + +rule gen_report_1x_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -DSINGLEPUMP -o $out + +rule gen_report_2x_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -DDOUBLEPUMP -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +report_target_1x = singlepump_report.a +report_target_2x = doublepump_report.a +report_target_1x_s10_pac = singlepump_s10_pac_report.a +report_target_2x_s10_pac = doublepump_s10_pac_report.a + +build report: phony ${report_target_1x} ${report_target_2x} +build ${report_target_1x}: gen_report_1x ${source_file} +build ${report_target_2x}: gen_report_2x ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_1x_s10_pac} ${report_target_2x_s10_pac} +build ${report_target_1x_s10_pac}: gen_report_1x_s10_pac ${source_file} +build ${report_target_2x_s10_pac}: gen_report_2x_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/memory_attributes.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/memory_attributes.cpp new file mode 100755 index 0000000000..f1fa9afb3a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/memory_attributes/src/memory_attributes.cpp @@ -0,0 +1,227 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// 
============================================================= +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +constexpr size_t kRows = 8; +constexpr size_t kVec = 4; +constexpr size_t kMaxVal = 512; +constexpr size_t kNumTests = 64; +constexpr size_t kMaxIter = 8; + +// Forward declaration of the kernel name +// (This will become unnecessary in a future compiler version.) +template +class KernelCompute; + +using UintArray = std::array; +using Uint2DArray = std::array, kRows>; +using UintSQArray = std::array, kVec>; // square + +// The shared compute function for host and device code +size_t Compute(unsigned init, Uint2DArray &dict_offset) { + + // We do not provide any attributes for compare_offset and hash; + // we let the compiler decide what's best based on the access pattern + // and their size. + UintSQArray compare_offset; + UintArray hash; + + #pragma unroll + for (size_t i = 0; i < kVec; i++) { + hash[i] = (++init) & (kRows - 1); + } + + size_t count = 0, iter = 0; + do { + // After unrolling both loops, we have kVec*kVec reads from dict_offset + #pragma unroll + for (size_t i = 0; i < kVec; i++) { + #pragma unroll + for (size_t k = 0; k < kVec; ++k) { + compare_offset[k][i] = dict_offset[hash[i]][k]; + } + } + + // After unrolling, we have kVec writes to dict_offset + #pragma unroll + for (size_t k = 0; k < kVec; ++k) { + dict_offset[hash[k]][k] = (init << k); + } + init++; + + #pragma unroll + for (size_t i = 0; i < kVec; i++) { + #pragma unroll + for (size_t k = 0; k < kVec; ++k) { + count += compare_offset[i][k]; + } + } + } while (++iter < kMaxIter); + return count; +} + +// Declare a 2D array with memory attribute 'doublepump' if +// attr_type=2, attribute 'singlepump' if attr_type=1, +// and no memory attributes otherwise +template +Uint2DArray CreateDictOffset() { + if (attr_type == 1) { + + // The memory attributes apply to the array's declaration + [[intelfpga::singlepump, intelfpga::memory("MLAB"), + 
intelfpga::numbanks(kVec), intelfpga::max_replicates(kVec)]] + Uint2DArray dict_offset; + + return dict_offset; + + } else if (attr_type == 2) { + + [[intelfpga::doublepump, intelfpga::memory("MLAB"), + intelfpga::numbanks(kVec), intelfpga::max_replicates(kVec)]] + Uint2DArray dict_offset; + + return dict_offset; + } + + return Uint2DArray{}; +} + +template +unsigned RunKernel(unsigned init, const unsigned dict_offset_init[]) { + unsigned result = 0; + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + queue q(device_selector, dpc_common::exception_handler); + + // Flatten the 2D array to a 1D buffer, because the + // buffer constructor requires a pointer to input data + // that is contiguous in memory. + buffer buffer_d(dict_offset_init, + range<1>(kRows * kVec)); + buffer buffer_r(&result, 1); + + auto e = q.submit([&](handler &h) { + auto accessor_d = buffer_d.get_access(h); + auto accessor_r = buffer_r.get_access(h); + + h.single_task>( + [=]() [[intel::kernel_args_restrict]] { + + // Declare 'dict_offset' to be single or double pumped + Uint2DArray dict_offset = CreateDictOffset(); + + // Initialize 'dict_offset' with values from global memory. + for (size_t i = 0; i < kRows; ++i) { + #pragma unroll + for (size_t k = 0; k < kVec; ++k) { + // After unrolling, we end up with kVec writes to dict_offset. + dict_offset[i][k] = accessor_d[i * kVec + k]; + } + } + accessor_r[0] = Compute(init, dict_offset); + }); + }); + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + return result; +} + +// This host side function performs the same computation as the device side +// kernel, and is used to verify functional correctness. +unsigned GoldenRun(unsigned init, unsigned const dict_offset_init[]) { + Uint2DArray dict_offset; + for (size_t i = 0; i < kRows; ++i) { + for (size_t k = 0; k < kVec; ++k) { + dict_offset[i][k] = dict_offset_init[i * kVec + k]; + } + } + return Compute(init, dict_offset); +} + +int main() { + srand(0); + + Uint2DArray dict_offset_init; + + bool passed = true; + + for (size_t j = 0; j < kNumTests; j++) { + unsigned init = rand() % kMaxVal; + unsigned int dict_offset_init[kRows * kVec]; + + // initialize input data with random values + for (size_t i = 0; i < kRows; ++i) { + for (size_t k = 0; k < kVec; ++k) { + dict_offset_init[i * kVec + k] = rand() % kMaxVal; + } + } + + // compute the golden result + unsigned golden_result = GoldenRun(init, dict_offset_init); + + // run the kernel with 'singlepump' memory attribute + unsigned result_sp = RunKernel<1>(init, dict_offset_init); + + if (!(result_sp == golden_result)) { + passed = false; + std::cout << " Test#" << j + << ": mismatch: " << result_sp << " != " << golden_result + << " (result_sp != golden_result)\n"; + } + + // run the kernel with 'doublepump' memory attribute + unsigned result_dp = RunKernel<2>(init, dict_offset_init); + + if (!(result_dp == golden_result)) { + passed = false; + std::cout << " Test#" << j + << ": mismatch: " << result_dp << " != " << golden_result + << " (result_dp != golden_result)\n"; + } + + // run the kernel with no memory attributes + unsigned result_na = RunKernel<0>(init, dict_offset_init); + + if (!(result_na == golden_result)) { + 
passed = false; + std::cout << " Test#" << j + << ": mismatch: " << result_na << " != " << golden_result + << " (result_na != golden_result)\n"; + } + } + + if (passed) { + std::cout << "PASSED: all kernel results are correct.\n"; + } else { + std::cout << "FAILED\n"; + return 1; + } + + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/CMakeLists.txt new file mode 100755 index 0000000000..63f680d7fd --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(Pipes) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/README.md new file mode 100755 index 0000000000..f168deb09f --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/README.md @@ -0,0 +1,250 @@ +# Data Transfers Using Pipes +This FPGA tutorial shows how to use pipes to transfer data between kernels. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | The basics of the DPC++ pipes extension for FPGA
How to declare and use pipes in a DPC++ program +| Time to complete | 15 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +This tutorial demonstrates how a kernel in a DPC++ FPGA program transfers +data to or from another kernel using the pipe abstraction. + +### Definition of a Pipe +The primary goal of pipes is to allow concurrent execution of kernels that need +to exchange data. + +A pipe is a FIFO data structure connecting two endpoints that communicate +using the pipe's `read` and `write` operations. An endpoint can be either a kernel +or an external I/O on the FPGA. Therefore, there are three types of pipes: +* kernel-kernel +* kernel-I/O +* I/O-kernel + +This tutorial focuses on kernel-kernel pipes, but +the concepts discussed here apply to other kinds of pipes as well. + +The `read` and `write` operations have two variants: +* Blocking variant: Blocking operations may not return immediately, but are always successful. +* Non-blocking variant: Non-blocking operations take an extra boolean parameter +that is set to `true` if the operation happened successfully. + +Data flows in a single direction inside pipes. In other words, for a pipe `P` +and two kernels using `P`, one of the kernels is exclusively going to perform +`write` to `P` while the other kernel is exclusively going to perform `read` from +`P`. Bidirectional communication can be achieved using two pipes. + +Each pipe has a configurable `capacity` parameter describing the number of `write` +operations that may be performed without any `read` operations being performed. For example, +consider a pipe `P` with capacity 3, and two kernels `K1` and `K2` using +`P`. Assume that `K1` performed the following sequence of operations: + + `write(1)`, `write(2)`, `write(3)` + +In this situation, the pipe is full, because three (the `capacity` of +`P`) `write` operations were performed without any `read` operation. 
In this +situation, a `read` must occur before any other `write` is allowed. + +If a `write` is attempted to a full pipe, one of two behaviors occur: + + * If the operation is non-blocking, it returns immediately and its + boolean parameter is set to `false`. The `write` does not have any effect. + * If the operation is blocking, it does not return until a `read` is + performed by the other endpoint. Once the `read` is performed, the `write` + takes place. + +The blocking and non-blocking `read` operations have analogous behaviors when +the pipe is empty. + +### Defining a Pipe in DPC++ + +In DPC++, pipes are defined as a class with static members. To declare a pipe that +transfers integer data and has `capacity=4`, use a type alias: + +```c++ +using ProducerToConsumerPipe = pipe< // Defined in the DPC++ headers. + class ProducerConsumerPipe, // An identifier for the pipe. + int, // The type of data in the pipe. + 4>; // The capacity of the pipe. +``` + +The `class ProducerToConsumerPipe` template parameter is important to the +uniqueness of the pipe. This class need not be defined, but must be distinct +for each pipe. Consider another type alias with the exact same parameters: + +```c++ +using ProducerToConsumerPipe2 = pipe< // Defined in the DPC++ headers. + class ProducerConsumerPipe, // An identifier for the pipe. + int, // The type of data in the pipe. + 4>; // The capacity of the pipe. +``` + +The uniqueness of a pipe is derived from a combination of all three template +parameters. Since `ProducerToConsumerPipe` and `ProducerToConsumerPipe2` have +the same template parameters, they define the same pipe. + +### Using a Pipe in DPC++ + +This code sample defines a `Consumer` and a `Producer` kernel connected +by the pipe `ProducerToConsumerPipe`. Kernels use the +`ProducerToConsumerPipe::write` and `ProducerToConsumerPipe::read` methods for +communication. 
+ +The `Producer` kernel reads integers from the global memory and writes those integers +into `ProducerToConsumerPipe`, as shown in the following code snippet: + +```c++ +void Producer(queue &q, buffer &input_buffer) { + std::cout << "Enqueuing producer...\n"; + + auto e = q.submit([&](handler &h) { + auto input_accessor = input_buffer.get_access(h); + auto num_elements = input_buffer.get_count(); + + h.single_task([=]() { + for (size_t i = 0; i < num_elements; ++i) { + ProducerToConsumerPipe::write(input_accessor[i]); + } + }); + }); +} +``` + +The `Consumer` kernel reads integers from `ProducerToConsumerPipe`, processes +the integers (`ConsumerWork(i)`), and writes the result into the global memory. + +```c++ +void Consumer(queue &q, buffer &output_buffer) { + std::cout << "Enqueuing consumer...\n"; + + auto e = q.submit([&](handler &h) { + auto output_accessor = output_buffer.get_access(h); + size_t num_elements = output_buffer.get_count(); + + h.single_task([=]() { + for (size_t i = 0; i < num_elements; ++i) { + int input = ProducerToConsumerPipe::read(); + int answer = ConsumerWork(input); + output_accessor[i] = answer; + } + }); + }); +} +``` + +**NOTE:** The `read` and `write` operations used are blocking. If +`ConsumerWork` is an expensive operation, then `Producer` might fill +`ProducerToConsumerPipe` faster than `Consumer` can read from it, causing +`Producer` to block occasionally. + +## Key Concepts +* The basics of the of DPC++ pipes extension for FPGA +* How to declare and use pipes in a DPC++ program + +## License +This code sample is licensed under MIT license. + +## Building the `pipes` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (FPGA) as well as whether to run in batch or interactive mode. 
For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. 
+ + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `pipes_report.prj/reports/` or `pipes_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Navigate to the "System Viewer" to visualize the structure of the kernel system. Identify the pipe connecting the two kernels. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./pipes.fpga_emu (Linux) + pipes.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./pipes.fpga (Linux) + ``` + +### Example of Output +``` +Input Array Size: 1024 +Enqueuing producer... +Enqueuing consumer... 
+PASSED: The results are correct +``` diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/pipes.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/pipes.sln new file mode 100755 index 0000000000..aa652a2f4b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/pipes.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pipes", "pipes.vcxproj", "{BE9E5E70-F644-4119-9A1F-E2B75C85B9E2}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {BE9E5E70-F644-4119-9A1F-E2B75C85B9E2}.Debug|x64.ActiveCfg = Debug|x64 + {BE9E5E70-F644-4119-9A1F-E2B75C85B9E2}.Debug|x64.Build.0 = Debug|x64 + {BE9E5E70-F644-4119-9A1F-E2B75C85B9E2}.Release|x64.ActiveCfg = Release|x64 + {BE9E5E70-F644-4119-9A1F-E2B75C85B9E2}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {47B77939-C7AE-44EC-AD38-EF8459A9C41A} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/pipes.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/pipes.vcxproj new file mode 100755 index 0000000000..7bae18102e --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/pipes.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {be9e5e70-f644-4119-9a1f-e2b75c85b9e2} + Win32Proj + pipes + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + 
+ Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)pipes.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)pipes.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/sample.json new file mode 100755 index 0000000000..1c67d49d41 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "58CF1ABA-5D08-40B7-ACC2-5CB904261413", + "name": "Data Transfers Using Pipes", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "How to use pipes to transfer data between kernels on an FPGA", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./pipes.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "pipes.fpga_emu.exe" + ] + }, + { + "id": 
"report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/CMakeLists.txt new file mode 100755 index 0000000000..f8a80a7e68 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/CMakeLists.txt @@ -0,0 +1,89 @@ +set(SOURCE_FILE pipes.cpp) +set(TARGET_NAME pipes) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) + +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# report + +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link 
${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/build.ninja new file mode 100755 index 0000000000..a45c4c511c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/build.ninja @@ -0,0 +1,30 @@ +source_file = pipes.cpp +target_name = pipes + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware -std=c++14 +emulator_flags = -fintelfpga -DFPGA_EMULATOR -std=c++14 + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: 
gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/pipes.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/pipes.cpp new file mode 100755 index 0000000000..71de729c3c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/pipes/src/pipes.cpp @@ -0,0 +1,135 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include +#include "dpc_common.hpp" + + +using namespace sycl; + +using ProducerToConsumerPipe = intel::pipe< // Defined in the SYCL headers. + class ProducerConsumerPipe, // An identifier for the pipe. + int, // The type of data in the pipe. + 4>; // The capacity of the pipe. + +// Forward declare the kernel names +// (This will become unnecessary in a future compiler version.) +class ProducerTutorial; +class ConsumerTutorial; + +// The Producer kernel reads data from a SYCL buffer and writes it to +// a pipe. This transfers the input data from the host to the Consumer kernel +// that is running concurrently. 
+void Producer(queue &q, buffer &input_buffer) { + std::cout << "Enqueuing producer...\n"; + + auto e = q.submit([&](handler &h) { + auto input_accessor = input_buffer.get_access(h); + size_t num_elements = input_buffer.get_count(); + + h.single_task([=]() { + for (size_t i = 0; i < num_elements; ++i) { + ProducerToConsumerPipe::write(input_accessor[i]); + } + }); + }); +} + + +// An example of some simple work, to be done by the Consumer kernel +// on the input data +int ConsumerWork(int i) { return i * i; } + +// The Consumer kernel reads data from the pipe, performs some work +// on the data, and writes the results to an output buffer +void Consumer(queue &q, buffer &out_buf) { + std::cout << "Enqueuing consumer...\n"; + + auto e = q.submit([&](handler &h) { + auto out_accessor = out_buf.get_access(h); + size_t num_elements = out_buf.get_count(); + + h.single_task([=]() { + for (size_t i = 0; i < num_elements; ++i) { + int input = ProducerToConsumerPipe::read(); + int answer = ConsumerWork(input); + out_accessor[i] = answer; + } + }); + }); +} + +int main(int argc, char *argv[]) { + size_t array_size = (1 << 10); + + if (argc > 1) { + std::string option(argv[1]); + if (option == "-h" || option == "--help") { + std::cout << "Usage: \n \n\nFAILED\n"; + return 1; + } else { + array_size = std::stoi(option); + } + } + + std::cout << "Input Array Size: " << array_size << "\n"; + + std::vector producer_input(array_size, -1); + std::vector consumer_output(array_size, -1); + + // Initialize the input data + for (size_t i = 0; i < array_size; i++) + producer_input[i] = i; + +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + queue q(device_selector, dpc_common::exception_handler); + + buffer producer_buffer(producer_input); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) 
+ buffer consumer_buffer(consumer_output.data(), array_size); + + // Run the two kernels concurrently. The Producer kernel sends + // data via a pipe to the Consumer kernel. + Producer(q, producer_buffer); + Consumer(q, consumer_buffer); + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + // Verify result + for (size_t i = 0; i < array_size; i++) { + if (consumer_output[i] != ConsumerWork(producer_input[i])) { + std::cout << "input = " << producer_input[i] + << " expected: " << ConsumerWork(producer_input[i]) + << " got: " << consumer_output[i] << "\n"; + std::cout << "FAILED: The results are incorrect\n"; + return 1; + } + } + std::cout << "PASSED: The results are correct\n"; + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/CMakeLists.txt new file mode 100755 index 0000000000..c18e7e73ed --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(SpeculatedIterations) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/License.txt 
new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/README.md new file mode 100755 index 0000000000..bd1d9359bf --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/README.md @@ -0,0 +1,174 @@ + +# Speculated Iterations of a Loop +This FPGA tutorial demonstrates applying the `speculated_iterations` attribute to a loop in a task kernel to enable more efficient loop pipelining. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. 
The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | What the `speculated_iterations` attribute does
How to apply the `speculated_iterations` attribute to loops in your program
How to determine the optimal number of speculated iterations +| Time to complete | 15 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +Loop speculation is an advanced loop pipelining optimization technique. It enables loop iterations to be initiated before determining whether they should have been initiated. "Speculated iterations" are those iterations that launch before the exit condition computation has completed. This is beneficial when the computation of the exit condition is preventing effective loop pipelining. + +The `speculated_iterations` attribute is a loop attribute that enables you to directly control the number of speculated iterations for a loop. The attribute `[[intelfpga::speculated_iterations(N)]]` takes an integer argument `N` to specify the permissible number of iterations to speculate. + +### Simple example +``` + [[intelfpga::speculated_iterations(1)]] + while (sycl::log10(x) < N) { + x += 1; + } + dst[0] = x; +``` +The loop in this example will have one speculated iteration. +### Operations with side effects +When launching speculated iterations, operations with side-effects (such as stores to memory) must be predicated by the exit condition to ensure functional correctness. For this reason, operations with side-effects must be scheduled until after the exit condition has been computed. + +### Optimizing the number of speculated iterations +Loop speculation is beneficial when the computation of the loop exit condition is the bottleneck preventing the compiler from achieving a smaller initiation interval (II). In such instances, increasing the number of speculated iterations often improves the II. Note that this may also uncover additional bottlenecks preventing the further optimization of the loop. + +However, adding speculated iterations is not without cost. They introduce overhead in nested loops, reducing overall loop occupancy. 
Consider the code snippet below: +```c++ +for (size_t i = 0; i < kMany; ++i) { + // The compiler may automatically infer speculated iterations + for (size_t j = 0; complex_exit_condition(j); ++j) { + output[i,j] = some_function(input[i,j]); + } +} +``` +The *i+1*th invocation of the inner loop cannot begin until all real and speculated iterations of its *i*th invocation have completed. This overhead is negligible if the number of speculated iterations is much less than the number of real iterations. However, when the inner loop's trip count is small on average, the overhead becomes non-negligible and the speculated iterations can become detrimental to throughput. In such circumstances, the `speculated_iterations` attribute can be used to *reduce* the number of speculated iterations chosen by the compiler's heuristics. + +In both increasing and decreasing cases, some experimentation is usually necessary. Choosing too few speculated iterations can increase the II because multiple cycles are required to evaluate the exit condition. Choosing too many speculated iterations creates unneeded "dead space" between sequential invocations of an inner loop. + +### Tutorial example +In the tutorial design's kernel, the exit condition of the loop involves a logarithm and a compare operation. This complex exit condition prevents the loop from achieving ```II=1```. + +The design enqueues variants of the kernel with 0, 10 and 27 speculated iterations respectively to demonstrate the effect of the `speculated_iterations` attribute on the Intel® PAC with Intel Arria® 10 GX FPGA. Different numbers are chosen for the Intel® PAC with Intel Stratix® 10 SX FPGA accordingly. + +## Key Concepts +* Description of the `speculated_iterations` attribute. +* How to apply the `speculated_iterations` attribute to loops in your program. +* Optimizing the number of speculated iterations. + +## License +This code sample is licensed under MIT license. 
+ + +## Building the `speculated_iterations` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. 
The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + +## Examining the Reports +Locate `report.html` in the `speculated_iterations_report.prj/reports/` or `speculated_iterations_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +In the "Loop Analysis" section of the report, check the II of the loop in each version of the kernel. Use the kernel with 0 speculated iteration as a base version, check its loop II as a hint for the ideal number for speculated iterations. The information shown below is from compiling on the Intel® PAC with Intel Arria® 10 GX FPGA. + +* When the number of `speculated iterations` is set to 0, the loop II is 27. +* Setting the `speculated iterations` to 27 yielded an II of 1. +* Setting the `speculated iterations` to an intermediate value of 10 results in an II of 3. + + +These results make sense when you recall that the loop exit computation has a latency of 27 cycles (suggested by looking at the loop II with 0 speculation). With no speculation, a new iteration can only be launched every 27 cycles. Increasing the speculation to 27 enables a new iteration to launch every cycle. 
Reducing the speculation to 10 results in an II of 3 because 10 speculated iterations multiplied by 3 cycles between iterations leaves 30 cycles in which to compute the exit condition, sufficient to cover the 27-cycle exit condition. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./speculated_iterations.fpga_emu (Linux) + speculated_iterations.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./speculated_iterations.fpga (Linux) + ``` + +### Example of Output +``` +Speculated Iterations: 0 -- kernel time: 8564.98 ms +Performance for kernel with 0 speculated iterations: 11675 MFLOPs +Speculated Iterations: 10 -- kernel time: 952 ms +Performance for kernel with 10 speculated iterations: 105076 MFLOPs +Speculated Iterations: 27 -- kernel time: 317 ms +Performance for kernel with 27 speculated iterations: 315181 MFLOPs +PASSED: The results are correct +``` +The execution time and throughput for each kernel are displayed. + +Note that this performance difference will be apparent only when running on FPGA hardware. The emulator, while useful for verifying functionality, will generally not reflect differences in performance.
+ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/sample.json new file mode 100755 index 0000000000..28f98e4a48 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "66A57127-1F8D-4769-8CCB-16ECC56A446F", + "name": "Speculated Iterations of a Loop", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating the speculated_iterations attribute", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./speculated_iterations.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "speculated_iterations.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/speculated_iterations.sln b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/speculated_iterations.sln new file mode 100755 index 0000000000..7155665db9 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/speculated_iterations.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "speculated_iterations", "speculated_iterations.vcxproj", "{CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}" +EndProject +Global + 
GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.ActiveCfg = Debug|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.Build.0 = Debug|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.ActiveCfg = Release|x64 + {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {92BEFAAB-0365-4E5A-9C4A-E50AB49B2A6B} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/speculated_iterations.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/speculated_iterations.vcxproj new file mode 100755 index 0000000000..7a23ad883f --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/speculated_iterations.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + 15.0 + {cf6a576b-665d-4f24-bb62-0dae7a7b3c64} + Win32Proj + speculated_iterations + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR -DA10 %(AdditionalOptions) + $(IntDir)speculated_iterations.obj + 
$(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR -DA10 %(AdditionalOptions) + $(IntDir)speculated_iterations.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/CMakeLists.txt new file mode 100755 index 0000000000..5140f431a0 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/CMakeLists.txt @@ -0,0 +1,97 @@ +set(SOURCE_FILE speculated_iterations.cpp) +set(TARGET_NAME speculated_iterations) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +# This tutorial needs to know which FPGA we are targeting to decide how many speculated_iterations to use +IF (_FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + SET(FPGA_BOARD_MACRO "-DA10") +ELSEIF(_FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + SET(FPGA_BOARD_MACRO "-DS10") +ELSE() + MESSAGE(FATAL_ERROR "Unknown board!") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga ${FPGA_BOARD_MACRO}") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${FPGA_BOARD_MACRO} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR ${FPGA_BOARD_MACRO}") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} +
DEPENDS ${SOURCE_FILE}) +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/build.ninja 
b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/build.ninja new file mode 100755 index 0000000000..e8c5f7f77e --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/build.ninja @@ -0,0 +1,32 @@ +source_file = speculated_iterations.cpp +target_name = speculated_iterations + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = ${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + # For the emulator, it makes no difference whether this sample is compiled with -DA10 or -DS10 + command = dpcpp /GX ${emulator_flags} -DA10 $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -DA10 -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -DS10 -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/speculated_iterations.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/speculated_iterations.cpp new file mode 100755 index 0000000000..f689a6eb03 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Features/speculated_iterations/src/speculated_iterations.cpp @@ -0,0 +1,150 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include <CL/sycl.hpp> +#include <CL/sycl/intel/fpga_extensions.hpp> +#include <chrono>
+#include <cmath> +#include <iomanip> +#include <iostream> +#include "dpc_common.hpp" + +// Use smaller values if run on the emulator to keep the CPU runtime reasonable +// Use the largest possible int values on the FPGA to show the difference in +// performance with and without speculated_iterations +#if defined(FPGA_EMULATOR) +constexpr float kUpper = 3.0f; +constexpr size_t kExpectedIterations = 1e3; +#else +constexpr float kUpper = 8.0f; +constexpr size_t kExpectedIterations = 1e8; +#endif + +using namespace sycl; + +// This is the class used to name the kernel for the runtime. +// This must be done when the kernel is expressed as a lambda. +template <int N> class KernelCompute; + +template <int spec_iter> +void ComplexExit(const device_selector &selector, float bound, int &res) { + double kernel_time_ms = 0.0; + try { + // create the device queue with profiling enabled + auto prop_list = property_list{property::queue::enable_profiling()}; + queue q(selector, dpc_common::exception_handler, prop_list); + + // The scalar inputs are passed to the kernel using the lambda capture, + // but a SYCL buffer must be used to return a scalar from the kernel. + buffer<int, 1> buffer_res(&res, 1); + + event e = q.submit([&](handler &h) { + auto accessor_res = buffer_res.get_access<access::mode::discard_write>(h); + + h.single_task<KernelCompute<spec_iter>>([=]() { + int x = 1; + + // Computing the exit condition of this loop is a complex operation. + // Since the value of var is not known at compile time, the loop + // trip count is variable and the exit condition must be evaluated at + // each iteration.
+ [[intelfpga::speculated_iterations(spec_iter)]] + while (sycl::log10((float)(x)) < bound) { + x++; + } + + accessor_res[0] = x; + }); + }); + + // get the kernel time in milliseconds + // this excludes memory transfer and queuing overhead + double startk = + e.template get_profiling_info<info::event_profiling::command_start>(); + double endk = + e.template get_profiling_info<info::event_profiling::command_end>(); + kernel_time_ms = (endk - startk) * 1e-6; + + } catch (exception const &exc) { + std::cout << "Caught synchronous SYCL exception:\n" << exc.what() << "\n"; + if (exc.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + // MFLOPs = mega floating point operations per second + double mflops = (double)(kExpectedIterations) / kernel_time_ms; + + std::cout << "Speculated Iterations: " << spec_iter + << " -- kernel time: " << kernel_time_ms << " ms\n"; + + std::cout << std::fixed << std::setprecision(0) + << "Performance for kernel with " << spec_iter + << " speculated iterations: " << mflops << " MFLOPs\n"; +} + +int main(int argc, char *argv[]) { +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector selector; +#else + intel::fpga_selector selector; +#endif + + float bound = kUpper; + + // We don't want "bound" to be a compile-time known constant value + if (argc > 1) { + std::string option(argv[1]); + bound = std::stoi(option); + } + + // result variables + int r0, r1, r2; + +// Choose the number of speculated iterations based on the FPGA board selected. +// This reflects compute latency differences on different hardware architectures, +// and is a low-level optimization.
+#if defined(A10) + ComplexExit<0>(selector, bound, r0); + ComplexExit<10>(selector, bound, r1); + ComplexExit<27>(selector, bound, r2); +#elif defined(S10) + ComplexExit<0>(selector, bound, r0); + ComplexExit<10>(selector, bound, r1); + ComplexExit<54>(selector, bound, r2); +#else + static_assert(false, "Invalid FPGA board macro"); +#endif + + bool passed = true; + + if (std::fabs(std::log10(r0) - bound) > 1e-5) { + std::cout << "Test 0 result mismatch " << std::log10(r0) + << " not within 0.00001 of " << bound << "\n"; + passed = false; + } + + if (std::fabs(std::log10(r1) - bound) > 1e-5) { + std::cout << "Test 1 result mismatch " << std::log10(r1) + << " not within 0.00001 of " << bound << "\n"; + passed = false; + } + + if (std::fabs(std::log10(r2) - bound) > 1e-5) { + std::cout << "Test 2 result mismatch " << std::log10(r2) + << " not within 0.00001 of " << bound << "\n"; + passed = false; + } + + + std::cout << (passed ? "PASSED: The results are correct" : "FAILED") << "\n"; + + return passed ?
0 : -1; +} + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/CMakeLists.txt new file mode 100755 index 0000000000..ec7f83f6b3 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(DeviceLink) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/README.md b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/README.md new file mode 100755 index 0000000000..e2991414b9 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/README.md @@ -0,0 +1,203 @@ + +# Separating Host and Device Code Compilation +This FPGA tutorial demonstrates how to separate the compilation of a program's host code and device code to save development time. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Why to separate host and device code compilation in your FPGA project
How to use the `-reuse-exe` and device link methods
Which method to choose for your project +| Time to complete | 15 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +Intel® oneAPI DPC++ Compiler (Beta) only supports ahead-of-time (AoT) compilation for FPGA, which means that an FPGA device image is generated at compile time. The FPGA device image generation process can take hours to complete. If you make a change that is exclusive to the host code, it is more efficient to recompile your host code only, re-using the existing FPGA device image and circumventing the time-consuming device compilation process. + +The Intel® oneAPI DPC++ Compiler (Beta) provides two different mechanisms to separate device code and host code compilation. +* Passing `-reuse-exe=` flag to `dpcpp` instructs the compiler to attempt to reuse the existing FPGA device image. +* The more explicit "device link" method requires you to separate the host and device code into separate files. When a code change only applies to host-only files, an FPGA device image is not regenerated. + +This tutorial explains both mechanisms and the pros and cons of each. The included code sample demonstrates the device link method. + +### Using the `-reuse-exe` flag + +If the device code and options affecting the device have not changed since the previous compilation, passing the `-reuse-exe=` flag to `dpcpp` instructs the compiler to extract the compiled FPGA binary from the existing executable and package it into the new executable, saving the device compilation time. + +**Sample usage:** + +``` +# Initial compilation +dpcpp -o out.fpga -Xshardware -fintelfpga +``` +The initial compilation generates an FPGA device image, which takes several hours. Now, make some changes to the host code. +``` +# Subsequent recompilation +dpcpp -o out.fpga -reuse-exe=out.fpga -Xshardware -fintelfpga +``` +If `out.fpga` does not exist, `-reuse-exe` is ignored and the FPGA device image is regenerated. 
This will always be the case the first time a project is compiled. + +If `out.fpga` is found, the compiler verifies that no changes that affect the FPGA device code have been made since the last compilation. If so, the compiler reuses the existing FPGA binary and only the host code is recompiled. The recompilation process takes a few minutes. Note that the device code is *partially* re-compiled (the equivalent of a report flow compile) in order to check that the FPGA binary can safely be reused. + +### Using the device link method + +The program accompanying this tutorial is separated into two files, `main.cpp` and `kernel.cpp`. Only the `kernel.cpp` file contains device code. + +In the normal compilation process, FPGA device image generation happens at link time. As a result, any change to either `main.cpp` or `kernel.cpp` will trigger the regeneration of an FPGA device image. + +``` +# normal compile command +dpcpp -fintelfpga main.cpp kernel.cpp -Xshardware -o link.fpga +``` + +The following graph depicts this compilation process: + +![](normal_compile.png) + + +If you want to iterate on the host code and avoid long compile time for your FPGA device, consider using a device link to separate device and host compilation: + +``` +# device link command +dpcpp -fintelfpga -fsycl-link=image [options] +``` + +The compilation is a 3-step process: + +1. Compile the device code: + + ``` + dpcpp -fintelfpga -fsycl-link=image kernel.cpp -o dev_image.a -Xshardware + ``` + Input files should include all source files that contain device code. This step may take several hours. + + +2. Compile the host code: + + ``` + dpcpp -fintelfpga main.cpp -c -o host.o + ``` + Input files should include all source files that only contain host code. This takes seconds. + + +3. Create the device link: + + ``` + dpcpp -fintelfpga host.o dev_image.a -o fast_recompile.fpga + ``` + The input should have N (N >= 0) host object files *(.o)* and one device image file *(.a)*. This takes seconds. 
+ +**NOTE:** You only need to perform steps 2 and 3 when modifying host-only files. + +The following graph depicts device link compilation process: + +![](fast_recompile.png) + +### Which method to use? +Of the two methods described, `-reuse-exe` is easier to use. It also allows you to keep your host and device code as single source, which is preferred for small programs. + +For larger and more complex projects, the device link method has the advantage of giving you complete control over the compiler's behavior. +* When using `-reuse-exe`, the compiler must spend time partially recompiling and then analyzing the device code to ensure that it is unchanged. This takes several minutes for larger designs. Compiling separate files does not incur this time. +* When using `-reuse-exe`, you may occasionally encounter a "false positive" where the compiler wrongly believes that it must recompile your device code. In a single source file, the device and host code are coupled, so certain changes to the host code can change the compiler's view of the device code. The compiler will always behave conservatively and trigger a full recompilation if it cannot prove that reusing the previous FPGA binary is safe. Compiling separate files eliminates this possibility. + + +## Key Concepts +* Why to separate host and device code compilation in your FPGA project +* How to use the `-reuse-exe` and device link methods +* Which method to choose for your project + +## License +This code sample is licensed under MIT license. + + +## Building the `fast_recompile` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. 
For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + **NOTE:** For the FPGA emulator target and the FPGA target, the device link method is used. +2. Compile the design through the generated `Makefile`. The following build targets are provided: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + +### On a Windows* System +Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead. + +1. Enter the source file directory. + ``` + cd src + ``` + +2. Compile the design. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + ninja fpga_emu + ``` + **NOTE:** For the FPGA emulator target, the device link method is used. + * Generate the optimization report: + + ``` + ninja report + ``` + If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use: + ``` + ninja report_s10_pac + ``` + * Compiling for FPGA hardware is not yet supported on Windows. 
+ + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./fast_recompile.fpga_emu (Linux) + fast_recompile.fpga_emu.exe (Windows) + ``` +2. Run the sample on the FPGA device: + ``` + ./fast_recompile.fpga (Linux) + ``` + +### Example of Output +``` +PASSED: results are correct +``` +### Discussion of Results +Try modifying `main.cpp` to produce a different output message. Then, perform a host-only recompile via the device link method to see how quickly the design is recompiled. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/device_link.png b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/device_link.png new file mode 100755 index 0000000000..18619231fa Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/device_link.png differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/fast_recompile.sln b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/fast_recompile.sln new file mode 100755 index 0000000000..cf3fe19782 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/fast_recompile.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fast_recompile", "fast_recompile.vcxproj", "{AD7020EE-30BB-496A-801E-A17F67699F38}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = 
preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {AD7020EE-30BB-496A-801E-A17F67699F38}.Debug|x64.ActiveCfg = Debug|x64 + {AD7020EE-30BB-496A-801E-A17F67699F38}.Debug|x64.Build.0 = Debug|x64 + {AD7020EE-30BB-496A-801E-A17F67699F38}.Release|x64.ActiveCfg = Release|x64 + {AD7020EE-30BB-496A-801E-A17F67699F38}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {73223E7A-81B2-40C4-8A0C-19D0021CFD05} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/fast_recompile.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/fast_recompile.vcxproj new file mode 100755 index 0000000000..a01f63e62c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/fast_recompile.vcxproj @@ -0,0 +1,166 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {ad7020ee-30bb-496a-801e-a17f67699f38} + Win32Proj + fast_recompile + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + + + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + 
$(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + + + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + + + + + + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/normal_compile.png b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/normal_compile.png new file mode 100755 index 0000000000..4903c6f371 Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/normal_compile.png differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/sample.json new file mode 100755 index 0000000000..5f703d1e6c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/sample.json @@ -0,0 +1,35 @@ +{ + "guid": "1457B49A-2CD3-48E5-B3A9-753EAD2D18F7", + "name": "Separating Host and Device Code Compilation", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating how to separate the compilation of host and device code to save development time.", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./fast_recompile.fpga_emu" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "fast_recompile.fpga_emu.exe" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/CMakeLists.txt new file mode 100755 index 
0000000000..1bf5ca6de7 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/CMakeLists.txt @@ -0,0 +1,119 @@ +set(DEVICE_SOURCE_FILE kernel.cpp) +set(KERNEL_HEADER_FILE kernel.hpp) +set(HOST_SOURCE_FILE main.cpp) +set(TARGET_NAME fast_recompile) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +String(STRIP "${CMAKE_EXE_LINKER_FLAGS}" CMAKE_EXE_LINKER_FLAGS) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS -fintelfpga -c) + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS -fintelfpga -DFPGA_EMULATOR -c) +set(EMULATOR_LINK_FLAGS -fintelfpga) + +# fpga emulator +if(WIN32) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set(HOST_EMU_OBJ "host_emu.o") + set(DEVICE_EMU_OBJ "dev_emu.o") + set(DEVICE_IMAGE_EMU_OBJ "dev_image_emu.a") + + add_custom_command(OUTPUT ${HOST_EMU_OBJ} + COMMAND dpcpp ${EMULATOR_COMPILE_FLAGS} + ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${HOST_EMU_OBJ} + DEPENDS ${HOST_SOURCE_FILE} ${KERNEL_HEADER_FILE}) + + add_custom_command(OUTPUT ${DEVICE_EMU_OBJ} + COMMAND dpcpp ${EMULATOR_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_EMU_OBJ} + DEPENDS ${DEVICE_SOURCE_FILE} ${KERNEL_HEADER_FILE}) + + add_custom_command(OUTPUT ${DEVICE_IMAGE_EMU_OBJ} + COMMAND dpcpp ${EMULATOR_LINK_FLAGS} -fsycl-link=image ${DEVICE_EMU_OBJ} -o ${DEVICE_IMAGE_EMU_OBJ} + DEPENDS ${DEVICE_EMU_OBJ}) + + add_custom_command(OUTPUT ${EMULATOR_TARGET} + COMMAND dpcpp -fintelfpga ${HOST_EMU_OBJ} ${DEVICE_IMAGE_EMU_OBJ} -o ${CMAKE_BINARY_DIR}/${EMULATOR_TARGET} + DEPENDS ${HOST_EMU_OBJ} ${DEVICE_IMAGE_EMU_OBJ}) +else() + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set(HOST_EMU_OBJ "host_emu.o") + set(DEVICE_EMU_OBJ "dev_emu.o") + set(DEVICE_IMAGE_EMU_OBJ "dev_image_emu.a") + + add_custom_command(OUTPUT ${HOST_EMU_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${EMULATOR_COMPILE_FLAGS} + ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${HOST_EMU_OBJ} + DEPENDS ${HOST_SOURCE_FILE} ${KERNEL_HEADER_FILE}) + + add_custom_command(OUTPUT ${DEVICE_EMU_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${EMULATOR_COMPILE_FLAGS} 
${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_EMU_OBJ} + DEPENDS ${DEVICE_SOURCE_FILE} ${KERNEL_HEADER_FILE}) + + add_custom_command(OUTPUT ${DEVICE_IMAGE_EMU_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${EMULATOR_LINK_FLAGS} -fsycl-link=image ${DEVICE_EMU_OBJ} -o ${DEVICE_IMAGE_EMU_OBJ} + DEPENDS ${DEVICE_EMU_OBJ}) + + add_custom_command(OUTPUT ${EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} -fintelfpga ${HOST_EMU_OBJ} ${DEVICE_IMAGE_EMU_OBJ} -o ${CMAKE_BINARY_DIR}/${EMULATOR_TARGET} + DEPENDS ${HOST_EMU_OBJ} ${DEVICE_IMAGE_EMU_OBJ}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set(HOST_OBJ "host.o") + set(DEVICE_OBJ "dev.o") + set(DEVICE_IMAGE_OBJ "dev_image.a") + + add_custom_command(OUTPUT ${HOST_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${HOST_OBJ} + DEPENDS ${HOST_SOURCE_FILE} ${KERNEL_HEADER_FILE}) + + add_custom_command(OUTPUT ${DEVICE_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_OBJ} + DEPENDS ${DEVICE_SOURCE_FILE} ${KERNEL_HEADER_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_IMAGE_OBJ} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link=image ${DEVICE_OBJ} -o ${DEVICE_IMAGE_OBJ} + DEPENDS ${DEVICE_OBJ}) + + add_custom_command(OUTPUT ${FPGA_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${CMAKE_EXE_LINKER_FLAGS} -fintelfpga ${HOST_OBJ} ${DEVICE_IMAGE_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} + DEPENDS ${HOST_OBJ} ${DEVICE_IMAGE_OBJ}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git 
a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/build.ninja new file mode 100755 index 0000000000..ef5b645c71 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/build.ninja @@ -0,0 +1,32 @@ +device_source_file = kernel.cpp +host_source_file = main.cpp +target_name = fast_recompile + +emulator_target = ${target_name}.fpga_emu.exe + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule parse_emu + command = dpcpp -c /EHcs ${emulator_flags} ${in} /Fo${out} + +rule gen_image_obj + command = dpcpp -fintelfpga -fsycl-link=image ${in} -o ${out} + +rule link + command = dpcpp -fintelfpga ${in} -o ${out} + +# FPGA emulator +build fpga_emu: phony ${emulator_target} + +host_emu_obj = host_emu.obj +dev_emu_obj = dev_emu.obj +dev_image_emu_obj = dev_image_emu.a + +build ${host_emu_obj}: parse_emu ${host_source_file} + +build ${dev_emu_obj}: parse_emu ${device_source_file} + +build ${dev_image_emu_obj}: gen_image_obj ${dev_emu_obj} + +build ${emulator_target}: link ${host_emu_obj} ${dev_image_emu_obj} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/kernel.cpp b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/kernel.cpp new file mode 100755 index 0000000000..680da15c67 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/kernel.cpp @@ -0,0 +1,70 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include "dpc_common.hpp" + +#include "kernel.hpp" + +// Forward declaration of the kernel name +// (This will become unnecessary in a future compiler version.) 
+class VectorAdd; + +void RunKernel(std::vector &vec_a, std::vector &vec_b, + std::vector &vec_r) { + + // Select either the FPGA emulator or FPGA device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + + // Create a queue bound to the chosen device. + // If the device is unavailable, a SYCL runtime exception is thrown. + queue q(device_selector, dpc_common::exception_handler); + + // Print out the device information. + std::cout << "Running on device: " + << q.get_device().get_info() << "\n"; + + // Device buffers + buffer device_a(vec_a); + buffer device_b(vec_b); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + buffer device_r(vec_r.data(), kArraySize); + + q.submit([&](handler &h) { + // Data accessors + auto a = device_a.get_access(h); + auto b = device_b.get_access(h); + auto r = device_r.get_access(h); + + // Kernel executes with pipeline parallelism on the FPGA. + // Use kernel_args_restrict to specify that a, b, and r do not alias. + h.single_task([=]() [[intel::kernel_args_restrict]] { + for (size_t i = 0; i < kArraySize; ++i) { + r[i] = a[i] + b[i]; + } + }); + }); + + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! 
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/kernel.hpp b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/kernel.hpp new file mode 100755 index 0000000000..b36fdb9be1 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/kernel.hpp @@ -0,0 +1,16 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +using namespace sycl; + +// tolerance used in floating point comparisons +constexpr float kTol = 0.001; + +// array size of vectors a, b and c +constexpr size_t kArraySize = 32; + +void RunKernel(std::vector &vec_a, std::vector &vec_b, + std::vector &vec_r); diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/main.cpp b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/main.cpp new file mode 100755 index 0000000000..2d001961d1 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fast_recompile/src/main.cpp @@ -0,0 +1,48 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include +#include +#include + +#include "kernel.hpp" + +using namespace sycl; + +int main() { + std::vector vec_a(kArraySize); + std::vector vec_b(kArraySize); + std::vector vec_r(kArraySize); + + // Fill vectors a and b with random float values + for (size_t i = 0; i < kArraySize; i++) { + vec_a[i] = rand() / 
(float)RAND_MAX; + vec_b[i] = rand() / (float)RAND_MAX; + } + + // The definition of this function is in a different compilation unit, + // so host and device code can be separately compiled. + RunKernel(vec_a, vec_b, vec_r); + + // Test the results + size_t correct = 0; + for (size_t i = 0; i < kArraySize; i++) { + float tmp = vec_a[i] + vec_b[i] - vec_r[i]; + if (tmp * tmp < kTol * kTol) { + correct++; + } + } + + // Summarize results + if (correct == kArraySize) { + std::cout << "PASSED: results are correct\n"; + } else { + std::cout << "FAILED: results are incorrect\n"; + } + + return !(correct == kArraySize); +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/CMakeLists.txt new file mode 100755 index 0000000000..0ac5b4f877 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/CMakeLists.txt @@ -0,0 +1,13 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + + +cmake_minimum_required (VERSION 2.8) + +project(CompileFlow) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom 
the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/README.md b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/README.md new file mode 100755 index 0000000000..2ddfd32e7a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/README.md @@ -0,0 +1,193 @@ +# Compiling DPC++ for FPGA +This FPGA tutorial introduces how to compile DPC++ for FPGA through a simple vector addition example. + +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04; Windows* 10 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How and why compiling DPC++ to FPGA differs from CPU or GPU
FPGA device image types and when to use them
The compile flags used to target FPGA +| Time to complete | 15 minutes + +_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_ + +## Purpose +Field-programmable gate arrays (FPGAs) are configurable integrated circuits that can be programmed to implement arbitrary circuit topologies. Classified as *spatial* compute architectures, FPGAs differ significantly from fixed Instruction Set Architecture (ISA) devices like CPUs and GPUs, and offer a different set of optimization trade-offs from these traditional accelerator devices. + +While DPC++ can be compiled for CPU, GPU or for FPGA, the process for compiling to FPGA is somewhat different than for CPU or GPU. This tutorial motivates these differences and explains how to compile a "Hello World"-style vector addition kernel for FPGA. + +### Why is FPGA compilation different? +FPGAs differ from CPUs and GPUs in many interesting ways. However, in the scope of this tutorial, there is only one difference that matters: compared to CPU or GPU, generating a device image for FPGA hardware is a computationally intensive and time-consuming process. It is normal for an FPGA compile to take several hours to complete. + +For this reason, only ahead-of-time (or "offline") kernel compilation mode is supported for FPGA. The long compile time for FPGA hardware makes just-in-time (or "online") compilation impractical. + +Long compile times are detrimental to developer productivity. The Intel® oneAPI DPC++ Compiler provides several mechanisms that enable DPC++ developers targeting FPGA to iterate quickly on their designs. By circumventing the time-consuming process of full FPGA compilation wherever possible, DPC++ FPGA developers can enjoy the fast compile times familiar to CPU and GPU developers. + + +### Three types of DPC++ FPGA compilation +The three types of FPGA compilation are summarized in the table below. 
+ +| Device Image Type | Time to Compile | Description +--- |--- |--- +| FPGA Emulator | seconds | The FPGA device code is compiled for the CPU.
This is used to verify the code's functional correctness. +| Optimization Report | minutes | The FPGA device code is partially compiled for hardware.
The compiler generates an optimization report that describes the structures generated on the FPGA, identifies performance bottlenecks, and estimates resource utilization. +| FPGA Hardware | hours | Generates the real FPGA bitstream to execute on the target FPGA platform + +The typical FPGA DPC++ development workflow is to iterate in each of these stages, refining the code using the feedback provided by that stage. Intel® recommends relying on emulation and the optimization report whenever possible. + +Compiling for FPGA emulation or to generate the FPGA optimization report requires only the Intel® oneAPI DPC++ Compiler (part of the Intel® oneAPI Base Toolkit). An FPGA hardware compile requires the Intel® FPGA Add-On for oneAPI Base Toolkit. + + +#### FPGA Emulator + +The FPGA emulator is the fastest method to verify the correctness of your code. The FPGA emulator executes DPC++ device code on the CPU. The emulator is similar to the SYCL* host device, but unlike the host device the FPGA emulator device supports FPGA extensions such as FPGA pipes and `fpga_reg`. + +There are two important caveats to remember when using the FPGA emulator. +* **Performance is not representative.** It is not meaningful to evaluate performance on the FPGA emulator, as it is not representative of the behavior of the FPGA device. For example, an optimization that yields a 100x performance improvement on the FPGA may show no impact on the emulator performance, or it may show an unrelated increase or decrease. +* **Undefined behavior may differ.** If your code produces different results when compiled for the FPGA emulator versus FPGA hardware, it is likely that your code is exercising undefined behavior. By definition, undefined behavior is not specified by the language specification, and may manifest differently on different targets. + +#### Optimization Report +A full FPGA compilation occurs in two stages: +1. 
**FPGA early image:** The DPC++ device code is optimized and converted into an FPGA design specified in Verilog RTL (a low-level, native entry language for FPGAs). This intermediate compilation result is the FPGA early device image, which is *not* executable. This FPGA early image compilation process takes minutes. +2. **FPGA hardware image:** The Verilog RTL specifying the design's circuit topology is mapped onto the FPGA's sea of primitive hardware resources by the Intel® Quartus® Prime software. Intel® Quartus® Prime is included in the Intel® FPGA Add-On, which is required for this compilation stage. The result is an FPGA hardware binary (also referred to as a bitstream). This compilation process takes hours. + +Optimization reports are generated after both stages. The optimization report generated after the FPGA early device image, sometimes called the "static report", contains significant information about how the compiler has transformed your DPC++ device code into an FPGA design. The report contains visualizations of structures generated on the FPGA, performance and expected performance bottleneck information, and estimated resource utilization. + +The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide/top/analyze-your-design.html) contains a chapter on how to analyze the reports generated after the FPGA early image and FPGA image. + +#### FPGA Hardware +This is a full compile through to the FPGA hardware image. You can target the Intel® PAC with Intel Arria® 10 GX FPGA, the Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA, or a custom board. + +### Device Selectors +The following code snippet demonstrates how you can specify the target device in your source code. The selector is used to specify the target device at runtime. 
+ +```c++ +// FPGA device selectors are defined in this utility header +#include + +int main() { + // Select either: + // - the FPGA emulator device (CPU emulation of the FPGA) + // - the FPGA device (a real FPGA) +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + queue q(device_selector); + ... +} +``` +Notice that the FPGA emulator and the FPGA are different target devices. It is recommended to use a preprocessor define to choose between the emulator and FPGA selectors. This makes it easy to switch between targets using only command-line flags. Since the FPGA only supports ahead-of-time compilation, dynamic selectors (such as the default_selector) are less useful than explicit selectors when targeting FPGA. + + +### Compiler Flags +Here is a cheat sheet of the DPC++ compiler commands to compile for the FPGA emulator, to generate the FPGA early image optimization reports, and to compile for FPGA hardware. +``` +# FPGA emulator +dpcpp -fintelfpga -DFPGA_EMULATOR fpga_compile.cpp -o fpga_compile.fpga_emu + +# Optimization report (default board) +dpcpp -fintelfpga -Xshardware -fsycl-link fpga_compile.cpp -o fpga_compile_report.a +# Optimization report (explicit board) +dpcpp -fintelfpga -Xshardware -fsycl-link -Xsboard=intel_s10sx_pac:pac_s10 fpga_compile.cpp -o fpga_compile_report.a + +# FPGA hardware (default board) +dpcpp -fintelfpga -Xshardware fpga_compile.cpp -o fpga_compile.fpga +# FPGA hardware (explicit board) +dpcpp -fintelfpga -Xshardware -Xsboard=intel_s10sx_pac:pac_s10 fpga_compile.cpp -o fpga_compile.fpga +``` + +The compiler flags used to achieve this are explained below. +| Flag | Explanation +--- |--- +| `-fintelfpga` | Perform ahead-of-time compilation for FPGA. +| `-DFPGA_EMULATOR` | Adds a preprocessor define (see code snippet above). +| `-Xshardware` | `-Xs` is used to pass arguments to the FPGA backend.
Since the emulator is the default FPGA target, you must pass `-Xshardware` to instruct the compiler to target FPGA hardware. +| `-Xsboard` | Optional argument to specify the FPGA board target.
If omitted, a default FPGA board is chosen. +| `-fsycl-link` | This is synonymous with `-fsycl-link=early`.
It instructs the compiler to stop after creating the FPGA early image (and associated optimization report). + +Notice that whether you are targeting the FPGA emulator or FPGA hardware must be specified twice: through compiler flags for the ahead-of-time compilation, and through the device selector for the runtime. + + +## Key Concepts +* How and why compiling DPC++ to FPGA differs from CPU or GPU +* FPGA device image types and when to use them +* The compile flags used to target FPGA + +## License +This code sample is licensed under MIT license. + + +## Building the `fpga_compile` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. 
The following build targets are provided, matching the recommended development flow: + + * Compile for [emulation](#fpga-emulator) (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the [optimization report](#optimization-report): + ``` + make report + ``` + * Compile for [FPGA hardware](#fpga-hardware) (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + + +## Examining the Reports +Locate `report.html` in the `fpga_compile_report.prj/reports/` or `fpga_compile_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*. + +Browse the reports that were generated for the `VectorAdd` kernel's FPGA early image. You may also wish to examine the reports generated by the full FPGA hardware compile and compare their contents. + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./fpga_compile.fpga_emu (Linux) + fpga_compile.fpga_emu.exe (Windows) + ``` +2. 
Run the sample on the FPGA device: + ``` + ./fpga_compile.fpga (Linux) + ``` + +### Example of Output +``` +PASSED: results are correct +``` diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/fpga_compile.sln b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/fpga_compile.sln new file mode 100755 index 0000000000..248072508d --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/fpga_compile.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.705 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fpga_compile", "fpga_compile.vcxproj", "{6271F8A8-6391-4040-BE74-71DDBD75CB64}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {6271F8A8-6391-4040-BE74-71DDBD75CB64}.Debug|x64.ActiveCfg = Debug|x64 + {6271F8A8-6391-4040-BE74-71DDBD75CB64}.Debug|x64.Build.0 = Debug|x64 + {6271F8A8-6391-4040-BE74-71DDBD75CB64}.Release|x64.ActiveCfg = Release|x64 + {6271F8A8-6391-4040-BE74-71DDBD75CB64}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {8122B579-CEB9-4397-AD32-FC1D48EE832E} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/fpga_compile.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/fpga_compile.vcxproj new file mode 100755 index 0000000000..2e4c2fb7aa --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/fpga_compile.vcxproj @@ -0,0 +1,160 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + 
{6271f8a8-6391-4040-be74-71ddbd75cb64} + Win32Proj + fpga_compile + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)fpga_compile.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + true + -DFPGA_EMULATOR %(AdditionalOptions) + $(IntDir)fpga_compile.obj + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/sample.json new file mode 100755 index 0000000000..9fa4654c33 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/sample.json @@ -0,0 +1,51 @@ +{ + "guid": "A211FDE2-B037-4069-BD84-C45E354798B7", + "name": "Compiling DPC++ for FPGA", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial introducing how to compile DPC++ for FPGA.", + "toolchain": ["dpcpp"], + "os": ["linux", "windows"], + "targetDevice": ["FPGA"], + "builder": ["ide", "cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + 
"steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./fpga_compile.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ], + "windows": [ + { + "id": "fpga_emu", + "steps": [ + "cd src", + "ninja fpga_emu", + "fpga_compile.fpga_emu.exe" + ] + }, + { + "id": "report", + "steps": [ + "cd src", + "ninja report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/CMakeLists.txt new file mode 100755 index 0000000000..4fa57ebc9c --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/CMakeLists.txt @@ -0,0 +1,89 @@ +set(SOURCE_FILE fpga_compile.cpp) +set(TARGET_NAME fpga_compile) + +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR ") +set(EMULATOR_LINK_FLAGS "-fintelfpga ") + +# fpga emulator +if(WIN32) + set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe) + add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET}) + separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}") + add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET} + DEPENDS ${SOURCE_FILE}) + +else() + add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) + add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) + set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) +endif() + +# fpga +if(WIN32) + add_custom_target(fpga + 
COMMAND echo "FPGA hardware flow is not supported in Windows") +else() + add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_custom_target(fpga DEPENDS ${FPGA_TARGET}) + set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) + set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) +endif() + +# generate report +if(WIN32) + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) + +else() + set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a) + add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE}) + + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) + + separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") + add_custom_command(OUTPUT ${DEVICE_OBJ_FILE} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE} + DEPENDS ${SOURCE_FILE}) +endif() + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/build.ninja new file mode 100755 index 0000000000..9dee50b9f6 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/build.ninja @@ -0,0 +1,30 @@ +source_file = fpga_compile.cpp +target_name = fpga_compile + +emulator_target = ${target_name}.fpga_emu.exe +report_target = ${target_name}_report.a +report_target_s10_pac = 
${target_name}_s10_pac_report.a + +hardware_flags = -fintelfpga -Xshardware +emulator_flags = -fintelfpga -DFPGA_EMULATOR + +rule build_fpga_emu + command = dpcpp /GX ${emulator_flags} $in -o $out + +rule gen_report + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out + +rule gen_report_s10_pac + command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out + +# FPGA emulator +build fpga_emu: phony ${emulator_target} +build ${emulator_target}: build_fpga_emu ${source_file} + +# report +build report: phony ${report_target} +build ${report_target}: gen_report ${source_file} + +# report (S10 PAC) +build report_s10_pac: phony ${report_target_s10_pac} +build ${report_target_s10_pac}: gen_report_s10_pac ${source_file} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/fpga_compile.cpp b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/fpga_compile.cpp new file mode 100755 index 0000000000..d0e1dcb963 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/GettingStarted/fpga_compile/src/fpga_compile.cpp @@ -0,0 +1,118 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include +#include "dpc_common.hpp" + +using namespace sycl; + +// Vector size for this example +constexpr size_t kSize = 1024; + +// Forward declaration of the kernel name +// (This will become unnecessary in a future compiler version.) +class VectorAdd; + + +int main() { + + // Set up three vectors and fill two with random values. 
+ std::vector vec_a(kSize), vec_b(kSize), vec_r(kSize); + for (int i = 0; i < kSize; i++) { + vec_a[i] = rand(); + vec_b[i] = rand(); + } + + // Select either: + // - the FPGA emulator device (CPU emulation of the FPGA) + // - the FPGA device (a real FPGA) +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + + // Create a queue bound to the chosen device. + // If the device is unavailable, a SYCL runtime exception is thrown. + queue q(device_selector, dpc_common::exception_handler); + + // Print out the device information. + std::cout << "Running on device: " + << q.get_device().get_info() << "\n"; + + { + // Create buffers to share data between host and device. + // The runtime will copy the necessary data to the FPGA device memory + // when the kernel is launched. + buffer buf_a(vec_a); + buffer buf_b(vec_b); + // Use verbose SYCL 1.2 syntax for the output buffer. + // (This will become unnecessary in a future compiler version.) + buffer buf_r(vec_r.data(), kSize); + + + // Submit a command group to the device queue. + q.submit([&](handler& h) { + + // The SYCL runtime uses the accessors to infer data dependencies. + // A "read" accessor must wait for data to be copied to the device + // before the kernel can start. A "write discard" accessor does not. + auto a = buf_a.get_access(h); + auto b = buf_b.get_access(h); + auto r = buf_r.get_access(h); + + // The kernel uses single_task rather than parallel_for. + // The task's for loop is executed in pipeline parallel on the FPGA, + // exploiting the same parallelism as an equivalent parallel_for. + h.single_task([=]() { + for (int i = 0; i < kSize; ++i) { + r[i] = a[i] + b[i]; + } + }); + }); + + // The buffer destructor is invoked when the buffers pass out of scope. + // buf_r's destructor updates the content of vec_r on the host. + } + + // The queue destructor is invoked when q passes out of scope. 
+ // q's destructor invokes q's exception handler on any device exceptions. + } + catch (sycl::exception const& e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + // Check the results. + int correct = 0; + for (int i = 0; i < kSize; i++) { + if ( vec_r[i] == vec_a[i] + vec_b[i] ) { + correct++; + } + } + + // Summarize and return. + if (correct == kSize) { + std::cout << "PASSED: results are correct\n"; + } else { + std::cout << "FAILED: results are incorrect\n"; + } + + return !(correct == kSize); +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/CMakeLists.txt new file mode 100755 index 0000000000..35161c6113 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(SystemProfiling) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this 
software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/README.md new file mode 100755 index 0000000000..12960b3317 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/README.md @@ -0,0 +1,300 @@ + +# Using the Intercept Layer for OpenCL* Applications to Identify Optimization Opportunities +This FPGA tutorial demonstrates how to use the Intercept Layer for OpenCL* Applications to perform system-level profiling on a design and reveal areas for improvement. + +***Documentation***: The [Intercept Layer for OpenCL* Applications](https://github.com/intel/opencl-intercept-layer) GitHub provides complete documentation for the use of this tool. The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. 
The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + + + +| Optimized for | Description +|--- |--- +| OS | Linux* Ubuntu* 18.04 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | Summary of profiling tools available for performance optimization
About the Intercept Layer for OpenCL* Applications
How to set up and use this tool
A case study of using this tool to identify when the double buffering system-level optimization is beneficial +| Time to complete | 30 minutes + +_Notice: Tutorial is not supported on Windows* as compiling to FPGA hardware is not yet supported in Windows*_ + +## Purpose +This FPGA tutorial demonstrates how to use the Intercept Layer for OpenCL* Applications, an open-source tool, to perform system-level profiling on a design and reveal areas for improvement. + +### Profiling Techniques +The following code snippet uses standard SYCL* and C++ language features to extract profiling information from DPC++ code. + +```c++ +void profiling_example(const std::vector& vec_in, + std::vector& vec_out ) { + + // Start the timer (using std::chrono) + dpc_common::TimeInterval exec_time; + + // Host performs pre-processing of input data + std::vector vec_pp = PreProcess(vec_in); + + // FPGA device performs additional processing + intel::fpga_selector selector; + queue q(selector, dpc_common::exception_handler, + property::queue::enable_profiling{}); + + buffer buf_in(vec_pp); + buffer buf_out(vec_out); + + event e = q.submit([&](handler &h) { + auto acc_in = buf_in.get_access(h); + auto acc_out = buf_out.get_access(h); + + h.single_task([=]() [[intel::kernel_args_restrict]] { + DeviceProcessing(acc_in, acc_out); + }); + }); + + // Query event e for kernel profiling information + // (blocks until command groups associated with e complete) + double kernel_time_ns = + e.get_profiling_info() - + e.get_profiling_info(); + + // Stop the timer. + double total_time_s = exec_time.Elapsed(); + + // Report profiling info + std::cout << "Kernel compute time: " << kernel_time_ns * 1e-6 << " ms\n"; + std::cout << "Total compute time: " << total_time_s * 1e3 << " ms\n"; +} +``` + +This tutorial introduces the Intercept Layer for OpenCL* Applications, a profiling tool that extracts and visualizes system-level profiling information for DPC++ programs. 
This tool can extract the same profiling data (and more) as the code snippet above, without requiring any code-level profiling directives. + +The Intercept Layer for OpenCL* provides coarse-grained, system-level profiling information. A complementary tool, the Intel® VTune™ Profiler, provides fine-grained profiling information for the kernels executing on the device. Together, these two tools can be used to optimize both host and device side execution. + +### The Intercept Layer for OpenCL* Applications + +The Intercept Layer for OpenCL* Applications is an open-source tool that you can use to profile DPC++ designs at a system-level. Although it is not part of the oneAPI Base Toolkit installation, it is freely available on GitHub. + +This tool serves the following purpose: +* Intercept host calls before they reach the device in order to gather performance data and log host calls. +* Provide data to visualize the calls through time, and can separate them into *queued*, *submitted*, and *execution* sections for a better understanding of the execution. +* Identify gaps (using visualization) in the runtime that may be leading to inefficient execution and throughput drops. + +The Intercept Layer for OpenCL* Applications has several different options for capturing different aspects of the host run. These options are described in its [documentation](https://github.com/intel/opencl-intercept-layer). This tutorial uses the call-logging and device timeline features that print information about the calls made by the host during execution. + +### Data Visualization + +You can visualize the data generated by the Intercept Layer for OpenCL* Applications in the following ways: +* __Google* Chrome* trace event profiling tool__: JSON files generated by the Intercept Layer for OpenCL Applications contain device timeline information. 
You can open these JSON files in the [Google* Chrome* trace event profiling tool](chrome://tracing/) to generate visual representation of the profiling data. +* __Microsoft* Excel*__: The Intercept Layer for OpenCL* Applications contains a Python script that parses the timeline information into a Microsoft* Excel* file, where it is presented both in a table format and in a bar graph. + +This tutorial will use the Google* Chrome trace event profiling tool for visualization. + +Use the visualized data to identify gaps in the runtime where events are waiting for something else to finish executing. These gaps represent potential opportunities for system-level optimization. While it is not possible to eliminate all such gaps, you might be able to eliminate those caused by dependencies that can be avoided. + +### Tutorial Example: Double Buffering + +This tutorial is based on the *double-buffering* optimization. Double-buffering allows host data processing and host transfers to the device-side buffer to occur in parallel with the kernel execution on the FPGA device. This parallelization is useful when the host performs any combination of the following actions between consecutive kernel runs: +* Preprocessing +* Postprocessing +* Writes to the device buffer + +By running host and device actions in parallel, execution gaps between kernels are removed as they no longer have to wait for the host to finish its operation. You can clearly see the benefits of double-buffering with the visualizations provided by the Intercept Layer output. + +### Setting up the Intercept Layer for OpenCL* Applications +The Intercept Layer for OpenCL* Applications is available on GitHub at the following URL: + +To set up the Intercept Layer for OpenCL* Applications, perform the following steps: + +1) [Download](https://github.com/intel/opencl-intercept-layer) the Intercept Layer for OpenCL* Applications version 2.2.1 or later from GitHub. 
+ + +2) Build the Intercept Layer according to the instructions provided in [How to Build the Intercept Layer for OpenCL* Applications](https://github.com/intel/opencl-intercept-layer/blob/master/docs/build.md). + * __Run `cmake`__: Ensure that you set `ENABLE_CLILOADER=1` when running cmake. + (i.e. `cmake -DENABLE_CLILOADER=1 ..` ) + * __Run `make`__: After the cmake step, `make` must be run in the build directory. This step builds the `cliloader` loader utility. + * __Add to your `PATH`__: The `cliloader` executable should now exist in the `<build directory>/cliloader/` directory. Add this directory to your `PATH` environment variable if you wish to run multiple designs using `cliloader`. + + You can now pass your executables to `cliloader` to run them with the intercept layer. For details about the `cliloader` loader utility, see [cliloader: A Intercept Layer for OpenCL* Applications Loader](https://github.com/intel/opencl-intercept-layer/blob/master/docs/cliloader.md). + +3) Set `cliloader` and other Intercept Layer options. + + If you run multiple designs with the same options, set up a `clintercept.conf` file in your home directory. You can also set the options as environment variables by prefixing the option name with `CLI_`. For example, the `DllName` option can be set through the `CLI_DllName` environment variable. For a list of options, see *Controls* in [How to Use the Intercept Layer for OpenCL Applications](https://github.com/intel/opencl-intercept-layer/blob/master/docs/controls.md). + + For this tutorial, set the following options: + +| Options/Variables | Description | +| --- | --- | +| `DllName=$CMPLR_ROOT/linux/lib/libOpenCL.so` | The intercept layer must know where the `libOpenCL.so` file from the original oneAPI build is. | +| `DevicePerformanceTiming=1` and `DevicePerformanceTimelineLogging=1` | These options print out runtime timeline information in the output of the executable run. 
| +| `ChromePerformanceTiming=1`, `ChromeCallLogging=1`, `ChromePerformanceTimingInStages=1` | These variables set up the chrome tracer output, and ensure the output has Queued, Submitted, and Execution stages. | + + +These instructions set up the `cliloader` executable, which provides some flexibility by allowing for more control over when the layer is used or not used. If you prefer a local installation (for a single design) or a global installation (always ON for all designs), follow the instructions at [How to Install the Intercept Layer for OpenCL Applications](https://github.com/intel/opencl-intercept-layer/blob/master/docs/install.md). + +### Running the Intercept Layer for OpenCL* Applications + +To run a compiled DPC++ program using the Intercept Layer for OpenCL* Applications, use the command: +`cliloader [executable args]` + +To run the tutorial example, refer to the "[Running the Sample](#running-the-sample)" section. + +When you run the host executable with the `cliloader` command, the `stderr` output contains lines as shown in the following example: +``` +Device Timeline for clEnqueueWriteBuffer (enqueue 1) = 63267241140401 ns (queued), 63267241149579 ns (submit), 63267241194205 ns (start), 63267242905519 ns (end) +``` + +These lines give the timeline information about a variety of oneAPI runtime calls. After the host executable finishes running, there is also a summary of the performance information for the run. + +### Viewing the Performance Data + +After the executable runs, the data collected will be placed in the `CLIntercept_Dump` directory, which is in the home directory by default. Its location can be adjusted using the `DumpDir=` `cliloader` option. `CLIntercept_Dump` contains a file called `clintercept_trace.json`. You can load this JSON file in the [Google* Chrome trace event profiling tool](chrome://tracing/) to visualize the timeline data collected by the run. 
+ +For this tutorial, this visualization appears as shown in the following example: + +![](full_example_trace.PNG) + +This visualization shows different calls executed through time. The X-axis is time, with the scale shown near the top of the page. The Y-axis shows different calls that are split up in several ways. + +The left side (Y-axis) has two different types of numbers: +* Numbers that contain a decimal point. + * The part of the number before the decimal point orders the calls approximately by start time. + * The part of the number after the decimal point represents the queue number the call was made in. +* Numbers that do not contain a decimal point. These numbers represent the thread ID of the thread being run on in the operating system. + +The colors in the trace represent different stages of execution: +* Blue during the *queued* stage +* Yellow during the *submitted* stage +* Orange for the *execution* stage + +Look for gaps between consecutive execution stages and kernel runs to identify possible areas for optimization. + + +### Applying Double-Buffering Using the Intercept Layer for OpenCL* Applications + +The double-buffering optimization can help minimize or remove gaps between consecutive kernels as they wait for host processing to finish. These gaps are minimized or removed by having the host perform processing operations on a second set of buffers while the kernel executes. With this execution order, the host processing is done by the time the next kernel can run, so kernel execution is not held up waiting for the host. + +For a more detailed explanation of the optimization, refer to the FPGA tutorial "Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing". + +In this tutorial, the first three kernels are run without the double-buffer optimization, and the next three are run with it. 
The kernels were run on an Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA when the intercept layer data was collected. The change made by this optimization can be clearly seen in the Intercept Layer for OpenCL* Applications trace: + +![](with_and_without_double_buffering.PNG) + +Here, the kernel runs named `_ZTS10SimpleVpow` can be recognized as the bars with the largest execution time (the large orange bars). Double buffering removes the gaps between the kernel executions that can be seen in the top trace image. This optimization improves the throughput of the design, as explained in the `double_buffering` tutorial. + +The Intercept Layer for OpenCL* Applications makes it clear why the double buffering optimization will benefit this design, and shows the performance improvement it achieves. Use the Intercept Layer tool on your designs to identify scenarios where you can apply double buffering and other system-level optimizations. + + +## Key Concepts +* A brief summary of the key profiling tools available for DPC++ performance optimization +* Understanding the Intercept Layer for OpenCL* Applications tool +* How to set up and use the Intercept Layer for OpenCL* Applications tool +* How to use the resulting information to identify opportunities for system-level optimizations such as double buffering + +## License +This code sample is licensed under MIT license. + +## Building the Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). 
+ +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./double_buffering.fpga_emu (Linux) + ``` +2. Run the sample on the FPGA device: + ``` + ./double_buffering.fpga (Linux) + ``` +3. Follow the instructions in the "[Setting up the Intercept Layer for OpenCL* Applications](#setting-up-the-intercept-layer-for-opencl-applications)" section to install and configure the `cliloader` tool. +4. Run the sample using the Intercept Layer for OpenCL* Applications to obtain system-level profiling information: + ``` + cliloader ./double_buffering.fpga (Linux) + ``` +5. 
Follow the instructions in the "[Viewing the Performance Data](#viewing-the-performance-data)" section to visualize the results. + +### Example of Output +__Intercept Layer for OpenCL* Applications results:__ +Your visualization results should resemble the screenshots in sections "[Viewing the Performance Data](#viewing-the-performance-data)" and "[Applying Double-Buffering Using the Intercept Layer for OpenCL* Applications](#applying-double-buffering-using-the-intercept-layer-for-opencl-applications)". + +__Command line `stdout`:__ +When run without `cliloader`, the tutorial output should resemble the result below. +``` +Platform name: Intel(R) FPGA SDK for OpenCL(TM) +Device name: pac_a10 : Intel PAC Platform (pac_ee00000) + +Executing kernel 100 times in each round. + +*** Beginning execution, without double buffering +Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time without double buffering = 29742 ms +Total kernel-only execution time without double buffering = 17856 ms +Throughput = 35.255249 MB/s + +*** Beginning execution, with double buffering. 
+Launching kernel #0 +Launching kernel #10 +Launching kernel #20 +Launching kernel #30 +Launching kernel #40 +Launching kernel #50 +Launching kernel #60 +Launching kernel #70 +Launching kernel #80 +Launching kernel #90 + +Overall execution time with double buffering = 17967 ms +Total kernel-only execution time with double buffering = 17869 ms +Throughput = 58.35976 MB/s + +Verification PASSED +``` diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/full_example_trace.PNG b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/full_example_trace.PNG new file mode 100755 index 0000000000..92d37fc5dc Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/full_example_trace.PNG differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/sample.json new file mode 100755 index 0000000000..c32c6f4f65 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/sample.json @@ -0,0 +1,25 @@ +{ + "guid": "9D7E5A6D-A39B-4FF8-B553-4B85116FCD69", + "name": "Using the OpenCL Intercept Layer to Profile Designs running on the FPGA", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "FPGA tutorial demonstrating how to use the OpenCL Intercept Layer to improve a design with the double buffering optimization", + "toolchain": ["dpcpp"], + "os": ["linux"], + "targetDevice": ["FPGA"], + "builder": ["cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./double_buffering.fpga_emu" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/src/CMakeLists.txt new file mode 100755 index 0000000000..78877c0592 --- /dev/null +++ 
b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/src/CMakeLists.txt @@ -0,0 +1,52 @@ +set(SOURCE_FILE double_buffering.cpp) +set(TARGET_NAME double_buffering) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. 
Please refer to the README for the list of valid board names.") +ENDIF() + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") + +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga") + +# fpga emulator +add_executable(${EMULATOR_TARGET} ${SOURCE_FILE}) +add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS}) +set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS}) + +# fpga +add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE}) +add_custom_target(fpga DEPENDS ${FPGA_TARGET}) +set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS}) +set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS}) + + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}.fpga_emu + DEPENDS ${TARGET_NAME}.fpga_emu) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/src/double_buffering.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/src/double_buffering.cpp new file mode 100755 index 0000000000..9884295b08 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/src/double_buffering.cpp @@ -0,0 +1,353 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include +#include + +#include "dpc_common.hpp" + +using namespace sycl; + +// For the system_profiling tutorial, we execute the kernel only a few times. +// This makes it easier to examine the generated profiling graphs. 
+// Note that the performance advantage of double buffering is more apparent on +// FPGA hardware with a larger number of kernel invocations. + +// kTimes = # times to execute the kernel. kTimes must be >= 2 +// kSize = # of floats to process on each kernel execution. +#if defined(FPGA_EMULATOR) +constexpr int kTimes = 3; +constexpr int kSize = 4096; +#else +constexpr int kTimes = 3; // originally 100 +constexpr int kSize = 2621440; +#endif + +// Kernel executes a power function (base^kPow). Must be +// >= 2. Can increase this to increase kernel execution +// time, but ProcessOutput() time will also increase. +constexpr int kPow = 20; + +// Number of iterations through the main loop +constexpr int kNumRuns = 2; + +bool pass = true; + +class SimpleVpow; + +/* Kernel function. + Performs buffer_b[i] = buffer_a[i] ** pow + Only supports pow >= 2. + This kernel is not meant to be an optimal implementation of the power + operation -- it's just a sample kernel for this tutorial whose execution time + is easily controlled via the pow parameter. SYCL buffers are created + externally and passed in by reference to control (external to this function) + when the buffers are destructed. The destructor causes a blocking buffer + transfer from device to host and double buffering requires us to not block + here (because we need to launch another kernel). So we only want this + transfer to occur at the end of overall execution, not at the end of each + individual kernel execution. 
+*/ +void SimplePow(std::unique_ptr &q, buffer &buffer_a, + buffer &buffer_b, event &e) { + // Submit to the queue and execute the kernel + e = q->submit([&](handler &h) { + // Get kernel access to the buffers + auto accessor_a = buffer_a.get_access(h); + auto accessor_b = buffer_b.get_access(h); + + const int num = kSize; + assert(kPow >= 2); + const int p = kPow - 1; // Assumes pow >= 2; + + h.single_task([=]() [[intel::kernel_args_restrict]] { + for (int j = 0; j < p; j++) { + if (j == 0) { + for (int i = 0; i < num; i++) { + accessor_b[i] = accessor_a[i] * accessor_a[i]; + } + } else { + for (int i = 0; i < num; i++) { + accessor_b[i] = accessor_b[i] * accessor_a[i]; + } + } + } + }); + }); + + event update_host_event; + update_host_event = q->submit([&](handler &h) { + auto accessor_b = buffer_b.get_access(h); + + /* + Explicitly instruct the SYCL runtime to copy the kernel's output buffer + back to the host upon kernel completion. This is not required for + functionality since the buffer access in ProcessOutput() also implicitly + instructs the runtime to copy the data back. But it should be noted that + this buffer access blocks ProcessOutput() until the kernel is complete + and the data is copied. In contrast, update_host() instructs the runtime + to perform the copy earlier. This allows ProcessOutput() to optionally + perform more useful work *before* making the blocking buffer access. Said + another way, this allows ProcessOutput() to potentially perform more work + in parallel with the runtime's copy operation. + */ + h.update_host(accessor_b); + }); +} + +// Returns kernel execution time for a given SYCL event from a queue. +ulong SyclGetExecTimeNs(event e) { + ulong start_time = + e.get_profiling_info(); + ulong end_time = + e.get_profiling_info(); + return (end_time - start_time); +} + +// Local pow function for verifying results +float MyPow(float input, int pow) { + return (pow == 0) ? 
1 : input * MyPow(input, pow - 1); +} + +/* Compares kernel output against expected output. Only compares part of the + output so that this method completes quickly. This is done + intentionally/artificially to keep host-processing time shorter than kernel + execution time. Grabs kernel output data from its SYCL buffer. Reading from + this buffer is a blocking operation that will block on the kernel completing. + Queries and records execution time of the kernel that just completed. This + is a natural place to do this because ProcessOutput() is blocked on kernel + completion. +*/ +void ProcessOutput(buffer &input_buf, + buffer &output_buf, int exec_number, event e, + ulong &total_kernel_time_per_slot) { + auto input_buf_acc = input_buf.get_access(); + auto output_buf_acc = output_buf.get_access(); + int num_errors = 0; + int num_errors_to_print = 10; + /* The use of update_host() in the kernel function allows for additional + host-side operations to be performed here, in parallel with the buffer copy + operation from device to host, before the blocking access to the output + buffer is made via output_buf_acc[]. To be clear, no real operations are + done here and this is just a note that this is the place + where you *could* do it. */ + for (int i = 0; i < kSize / 8; i++) { + const bool out_valid = (MyPow(input_buf_acc[i], kPow) != output_buf_acc[i]); + if ((num_errors < num_errors_to_print) && out_valid) { + if (num_errors == 0) { + pass = false; + std::cout << "Verification failed on kernel execution # " << exec_number + << ". Showing up to " << num_errors_to_print + << " mismatches.\n"; + } + std::cout << "Verification failed on kernel execution # " << exec_number + << ", at element " << i << ". Expected " << std::fixed + << std::setprecision(16) << MyPow(input_buf_acc[i], kPow) + << " but got " << output_buf_acc[i] << "\n"; + num_errors++; + } + } + + // At this point we know the kernel has completed, + // so can query the profiling data. 
+ total_kernel_time_per_slot += SyclGetExecTimeNs(e); +} + +/* + Generates input data for the next kernel execution. Only fills part of the + buffer so that this method completes quickly. This is done + intentionally/artificially to keep host-processing time shorter than kernel + execution time. Writes the data into the associated SYCL buffer. The write + will block until the previous kernel execution, that is using this buffer, + completes. +*/ +void ProcessInput(buffer &buf) { + // We are generating completely new input data, so can use discard_write() + // here to indicate we don't care about the SYCL buffer's current contents. + auto buf_acc = buf.get_access(); + + // RNG seed + auto seed = std::chrono::system_clock::now().time_since_epoch().count(); + + // RNG engine + std::default_random_engine dre(seed); + + // generate random numbers between 1 and 2 + std::uniform_real_distribution di(1.0f, 2.0f); + + // Randomly generate a start value and increment from there. + // Compared to randomly generating every value, this is done to + // speed up this function a bit. + float start_val = di(dre); + + for (int i = 0; i < kSize / 8; i++) { + buf_acc[i] = start_val; + start_val++; + } +} + +int main() { +// Create queue, get platform and device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; + std::cout << "\nEmulator output does not demonstrate true hardware " + "performance. 
The design may need to run on actual hardware " + "to observe the performance benefit of the optimization " + "exemplified in this tutorial.\n\n"; +#else + intel::fpga_selector device_selector; +#endif + + try { + auto prop_list = + property_list{property::queue::enable_profiling()}; + + std::unique_ptr q; + q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list)); + + platform platform = q->get_context().get_platform(); + device device = q->get_device(); + std::cout << "Platform name: " + << platform.get_info().c_str() << "\n"; + std::cout << "Device name: " + << device.get_info().c_str() << "\n\n\n"; + + std::cout << "Executing kernel " << kTimes << " times in each round.\n\n"; + + // Create a vector to store the input/output SYCL buffers + std::vector> input_buf; + std::vector> output_buf; + + // SYCL events for each kernel launch. + event sycl_events[2]; + + // In nanoseconds. Total execution time of kernels in a given slot. + ulong total_kernel_time_per_slot[2]; + + // Total execution time of all kernels. + ulong total_kernel_time = 0; + + // Allocate vectors to store the host-side copies of the input data + // Create and allocate the SYCL buffers + for (int i = 0; i < 2; i++) { + input_buf.push_back(buffer(range<1>(kSize))); + output_buf.push_back(buffer(range<1>(kSize))); + } + + /* + Main loop. This loop runs twice to show the performance difference without + and with double buffering. + */ + for (int i = 0; i < kNumRuns; i++) { + for (int i = 0; i < 2; i++) { + total_kernel_time_per_slot[i] = 0; // Initialize timers to zero. + } + + switch (i) { + case 0: { + std::cout << "*** Beginning execution, without double buffering\n"; + break; + } + case 1: { + std::cout << "*** Beginning execution, with double buffering.\n"; + break; + } + default: { + std::cout << "*** Beginning execution.\n"; + } + } + + // Start the timer. This will include the time to process the input data + // for the first 2 kernel executions. 
+ dpc_common::TimeInterval exec_time; + + if (i == 0) { // Single buffering + for (int i = 0; i < kTimes; i++) { + // Only print every few iterations, just to limit the prints. + if (i % 10 == 0) { + std::cout << "Launching kernel #" << i << "\n"; + } + + ProcessInput(input_buf[0]); + SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]); + ProcessOutput(input_buf[0], output_buf[0], i, sycl_events[0], + total_kernel_time_per_slot[0]); + } + } else { // Double buffering + // Process input for first 2 kernel launches and queue them. Then block + // on processing the output of the first kernel. + ProcessInput(input_buf[0]); + ProcessInput(input_buf[1]); + + std::cout << "Launching kernel #0\n"; + + SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]); + for (int i = 1; i < kTimes; i++) { + if (i % 10 == 0) { + std::cout << "Launching kernel #" << i << "\n"; + } // Only print every few iterations, just to limit the prints. + + // Launch the next kernel + SimplePow(q, input_buf[i % 2], output_buf[i % 2], sycl_events[i % 2]); + + // Process output from previous kernel. This will block on kernel + // completion. + ProcessOutput(input_buf[(i - 1) % 2], output_buf[(i - 1) % 2], i, + sycl_events[(i - 1) % 2], + total_kernel_time_per_slot[(i - 1) % 2]); + + // Generate input for the next kernel. + ProcessInput(input_buf[(i - 1) % 2]); + } + + // Process output of the final kernel + ProcessOutput(input_buf[(kTimes - 1) % 2], output_buf[(kTimes - 1) % 2], + i, sycl_events[(kTimes - 1) % 2], + total_kernel_time_per_slot[(kTimes - 1) % 2]); + } + + // Add up the overall kernel execution time. + total_kernel_time = 0; + for (int i = 0; i < 2; i++) { + total_kernel_time += total_kernel_time_per_slot[i]; + } + + // Stop the timer. + double time_span = exec_time.Elapsed(); + + std::cout << "\nOverall execution time " + << ((i == 0) ? 
"without" : "with") << " double buffering = " + << (unsigned)(time_span * 1000) << " ms\n"; + std::cout << "Total kernel-only execution time " + << ((i == 0) ? "without" : "with") << " double buffering = " + << (unsigned)(total_kernel_time / 1000000) << " ms\n"; + std::cout << "Throughput = " << std::setprecision(8) + << (float)kSize * (float)kTimes * (float)sizeof(float) / + (float)time_span / 1000000 + << " MB/s\n\n\n"; + } + if (pass) { + std::cout << "Verification PASSED\n"; + } else { + std::cout << "Verification FAILED\n"; + return 1; + } + } catch (sycl::exception const& e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + return 0; +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/with_and_without_double_buffering.PNG b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/with_and_without_double_buffering.PNG new file mode 100755 index 0000000000..dffc959919 Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/Tools/system_profiling/with_and_without_double_buffering.PNG differ diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/CMakeLists.txt new file mode 100755 index 0000000000..96498624f9 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_COMPILER "dpcpp") + +cmake_minimum_required (VERSION 2.8) + +project(UseLibrary) + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +add_subdirectory (src) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/License.txt new file mode 100755 index 0000000000..e63c6e13dc --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/README.md b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/README.md new file mode 100755 index 0000000000..f713db02bb --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/README.md @@ -0,0 +1,126 @@ + +# Using FPGA Cross-Language Libraries +This FPGA tutorial demonstrates how to build DPC++ device libraries from various sources and use them in your DPC++ design. 
+ +***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. + +| Optimized for | Description +--- |--- +| OS | Linux* Ubuntu* 18.04 +| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA +| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit +| What you will learn | How to create and use libraries in DPC++ FPGA projects
How power users can incorporate RTL source code in DPC++ for FPGA +| Time to complete | 15 minutes + +_Notice: The FPGA library feature is not yet supported in Windows*_ + +## Purpose +This FPGA tutorial demonstrates how to build DPC++ device libraries from multiple sources and use them in your DPC++ design. A library is useful for reusing and sharing code, or for separating code for testing purposes. Power users can also use libraries to leverage the features of other programming languages in their DPC++ FPGA designs. + +It is currently possible to generate FPGA library objects from the following source types: +* Verilog or VHDL (modules or entities respectively) +* Intel® High Level Synthesis Compiler (HLS) functions +* OpenCL* 1.2 functions +* SYCL* or DPC++ functions + +This code sample uses libraries from all four supported source types within a single project. + +### Generating a library +To create a library from source code, use the following steps: + +1. `fpga_crossgen` creates an object file that contains representations for target devices (FPGA) and FPGA emulator. The following commands instruct `fpga_crossgen` to generate DPC++ target objects from the four sources in this tutorial: + + ``` + fpga_crossgen lib_hls.cpp --source hls --target sycl -o lib_hls.o + fpga_crossgen lib_ocl.cl --source ocl --target sycl -o lib_ocl.o + fpga_crossgen lib_sycl.cpp --source sycl --target sycl -o lib_sycl.o + fpga_crossgen lib_rtl_spec.xml --emulation_model lib_rtl_model.cpp --target sycl -o lib_rtl.o + ``` + Notice that generating an RTL library requires that an `xml` file and emulation model be provided in addition to the Verilog source code. Examine the tutorial source code and the comments in `use_library.cpp` for more details. +2. `fpga_libtool` collects one or more objects into a DPC++ library archive file. 
This command creates a single library archive file from the four object files generated by `fpga_crossgen` in the previous step: + + ``` + fpga_libtool lib_hls.o lib_ocl.o lib_rtl.o lib_sycl.o --target sycl --create lib.a + ``` +### Using the library +To use the generated library in your project, simply add the generated library archive file to the list of input source files when invoking `dpcpp`. To compile the `use_library` tutorial, pass both `use_library.cpp` and `lib.a` as inputs. +``` +# Compile for FPGA emulator +dpcpp -fintelfpga use_library.cpp lib.a -o use_library_emu.fpga -DFPGA_EMULATOR + +# Compile for FPGA hardware +dpcpp -fintelfpga use_library.cpp lib.a -o use_library.fpga -Xshardware +``` + + +## Key Concepts +* How to create and use libraries in DPC++ FPGA projects +* How power users can incorporate RTL source code in DPC++ for FPGA + +## License +This code sample is licensed under MIT license. + + +## Building the `use_library` Tutorial + +### Include Files +The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +### Running Samples in DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)). + +When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h. + +### On a Linux* System + +1. Generate the `Makefile` by running `cmake`. + ``` + mkdir build + cd build + ``` + To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command: + ``` + cmake .. + ``` + Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command: + + ``` + cmake .. 
-DFPGA_BOARD=intel_s10sx_pac:pac_s10 + ``` + +2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow: + + * Compile for emulation (fast compile time, targets emulated FPGA device): + ``` + make fpga_emu + ``` + * Generate the optimization report: + ``` + make report + ``` + * Compile for FPGA hardware (longer compile time, targets FPGA device): + ``` + make fpga + ``` +3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here. + + ### In Third-Party Integrated Development Environments (IDEs) + +You can compile and run this tutorial in the Eclipse* IDE (in Linux*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide) + + +## Running the Sample + + 1. Run the sample on the FPGA emulator (the kernel executes on the CPU): + ``` + ./use_library.fpga_emu (Linux) + ``` +2. Run the sample on the FPGA device: + ``` + ./use_library.fpga (Linux) + ``` + +### Example of Output +``` +PASSED: result is correct! 
+``` diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/sample.json new file mode 100755 index 0000000000..a9b38b95f2 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/sample.json @@ -0,0 +1,34 @@ +{ + "guid": "9605DCBF-6DDB-4FD2-812F-1ECF252AE334", + "name": "Using FPGA Cross-Language Libraries", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"], + "description": "Tutorial demonstrating how to create FPGA libraries and to incorporate them in a DPC++ project", + "toolchain": ["dpcpp"], + "os": ["linux"], + "targetDevice": ["FPGA"], + "builder": ["cmake"], + "languages": [{"cpp":{}}], + "ciTests": { + "linux": [ + { + "id": "fpga_emu", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make fpga_emu", + "./use_library.fpga_emu" + ] + }, + { + "id": "report", + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make report" + ] + } + ] + } +} diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/CMakeLists.txt new file mode 100755 index 0000000000..0f6889708b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/CMakeLists.txt @@ -0,0 +1,133 @@ +set(SOURCE_FILE use_library.cpp) +set(HEADER_FILE lib.hpp) +set(TARGET_NAME use_library) +set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu) +set(FPGA_TARGET ${TARGET_NAME}.fpga) +set(REPORT_TARGET ${TARGET_NAME}_report.a) + +# Intel supported FPGA Boards and their names +set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10") +set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10") + +# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA +SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME}) + +# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA +IF (NOT DEFINED FPGA_BOARD) + MESSAGE(STATUS "\tFPGA_BOARD was not specified. 
Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.") + +ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME}) + MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.") + SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME}) + +ELSE() + MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.") +ENDIF() + +set(HLS_SOURCE lib_hls.cpp) +set(HLS_SOURCE_OBJECT lib_hls.o) + +set(OCL_SOURCE lib_ocl.cl) +set(OCL_SOURCE_OBJECT lib_ocl.o) + +set(SYCL_SOURCE lib_sycl.cpp) +set(SYCL_SOURCE_OBJECT lib_sycl.o) + +set(RTL_C_MODEL lib_rtl_model.cpp) +set(RTL_SPEC lib_rtl_spec.xml) +set(RTL_V lib_rtl.v) +set(RTL_SOURCE_OBJECT lib_rtl.o) + +set(LIBRARY_ARCHIVE lib.a) + +set(LIBRARY_DEVICE_LINK_FLAGS "${LIBRARY_ARCHIVE}") +set(LIBRARY_HOST_LINK_FLAGS "${HLS_SOURCE_OBJECT} ${OCL_SOURCE_OBJECT} ${SYCL_SOURCE_OBJECT} ${RTL_SOURCE_OBJECT}") + +set(HARDWARE_COMPILE_FLAGS "-fintelfpga") +# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation +set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${LIBRARY_DEVICE_LINK_FLAGS} ${USER_HARDWARE_FLAGS}") + +set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR") +set(EMULATOR_LINK_FLAGS "-fintelfpga ${LIBRARY_DEVICE_LINK_FLAGS}") + +#create hls source object +add_custom_target( + create_hls_source_object + COMMAND fpga_crossgen ${HLS_SOURCE} --source hls --target 
sycl -o ${HLS_SOURCE_OBJECT} ${CMAKE_CXX_FLAGS} + ) + +#create ocl source object +add_custom_target( + create_ocl_source_object + COMMAND fpga_crossgen ${OCL_SOURCE} --source ocl --target sycl -o ${OCL_SOURCE_OBJECT} + ) + +#create sycl source object +add_custom_target( + create_sycl_source_object + COMMAND fpga_crossgen ${SYCL_SOURCE} --source sycl --target sycl -o ${SYCL_SOURCE_OBJECT} ${CMAKE_CXX_FLAGS} + ) + +#create rtl source object +add_custom_target( + create_rtl_source_object + COMMAND fpga_crossgen ${RTL_SPEC} --emulation_model ${RTL_C_MODEL} --target sycl -o ${RTL_SOURCE_OBJECT} + ) + +#create library archive +add_custom_target( + create_library_archive + COMMAND fpga_libtool ${HLS_SOURCE_OBJECT} ${OCL_SOURCE_OBJECT} ${SYCL_SOURCE_OBJECT} ${RTL_SOURCE_OBJECT} --target sycl --create ${LIBRARY_ARCHIVE} + DEPENDS create_hls_source_object create_ocl_source_object create_sycl_source_object create_rtl_source_object + ) + +# fpga emulator +set(SOURCE_OBJ_FILE_EMU ${SOURCE_FILE}.emu.o) +add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET}) +separate_arguments(EMULATOR_COMPILE_FLAGS_LIST UNIX_COMMAND "${EMULATOR_COMPILE_FLAGS}") +add_custom_command(OUTPUT ${SOURCE_OBJ_FILE_EMU} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${SOURCE_FILE} ${EMULATOR_COMPILE_FLAGS_LIST} -c -o ${SOURCE_OBJ_FILE_EMU} + DEPENDS ${SOURCE_FILE} ${HEADER_FILE}) +separate_arguments(EMULATOR_LINK_FLAGS_LIST UNIX_COMMAND "${EMULATOR_LINK_FLAGS}") +add_custom_command(OUTPUT ${EMULATOR_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${SOURCE_OBJ_FILE_EMU} ${EMULATOR_LINK_FLAGS_LIST} -o ${CMAKE_BINARY_DIR}/${EMULATOR_TARGET} + DEPENDS ${SOURCE_OBJ_FILE_EMU} create_library_archive) + +# fpga +set(SOURCE_OBJ_FILE_FPGA ${SOURCE_FILE}.fpga.o) +add_custom_target(fpga DEPENDS ${FPGA_TARGET}) +separate_arguments(HARDWARE_COMPILE_FLAGS_LIST UNIX_COMMAND "${HARDWARE_COMPILE_FLAGS}") +add_custom_command(OUTPUT ${SOURCE_OBJ_FILE_FPGA} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} 
${SOURCE_FILE} ${HARDWARE_COMPILE_FLAGS_LIST} -c -o ${SOURCE_OBJ_FILE_FPGA} + DEPENDS ${SOURCE_FILE} ${HEADER_FILE}) +separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}") +add_custom_command(OUTPUT ${FPGA_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${SOURCE_OBJ_FILE_FPGA} ${HARDWARE_LINK_FLAGS_LIST} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} + DEPENDS ${SOURCE_OBJ_FILE_FPGA} create_library_archive) + +# report +set(SOURCE_OBJ_FILE_REPORT ${SOURCE_FILE}.report.o) +add_custom_target(report DEPENDS ${REPORT_TARGET}) +add_custom_command(OUTPUT ${SOURCE_OBJ_FILE_REPORT} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${SOURCE_FILE} ${HARDWARE_COMPILE_FLAGS_LIST} -c -o ${SOURCE_OBJ_FILE_REPORT} + DEPENDS ${SOURCE_FILE} ${HEADER_FILE}) +add_custom_command(OUTPUT ${REPORT_TARGET} + COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${SOURCE_OBJ_FILE_REPORT} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link -o ${CMAKE_BINARY_DIR}/${REPORT_TARGET} + DEPENDS ${SOURCE_OBJ_FILE_REPORT} create_library_archive) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${HEADER_FILE} ${HEADER_FILE} COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${HLS_SOURCE} ${HLS_SOURCE} COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${OCL_SOURCE} ${OCL_SOURCE} COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SYCL_SOURCE} ${SYCL_SOURCE} COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${RTL_SPEC} ${RTL_SPEC} COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${RTL_C_MODEL} ${RTL_C_MODEL} COPYONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${RTL_V} ${RTL_V} COPYONLY) + +# run +add_custom_target(run + COMMAND ../${TARGET_NAME}_emu.fpga + DEPENDS ${TARGET_NAME}_emu.fpga) diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib.hpp b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib.hpp new file mode 100755 index 0000000000..968b1139c4 
--- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib.hpp @@ -0,0 +1,9 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +SYCL_EXTERNAL float HlsSqrtf(float); +SYCL_EXTERNAL extern "C" float OclSquare(float); +SYCL_EXTERNAL float SyclSquare(float); +SYCL_EXTERNAL extern "C" unsigned RtlByteswap(unsigned x); diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_hls.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_hls.cpp new file mode 100755 index 0000000000..7e488a1271 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_hls.cpp @@ -0,0 +1,7 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include "HLS/math.h" +float HlsSqrtf(float x) { return sqrtf(x); } diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_ocl.cl b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_ocl.cl new file mode 100755 index 0000000000..bf2a1c4930 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_ocl.cl @@ -0,0 +1,6 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +float OclSquare(float x) { return x * x; } diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl.v b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl.v new file mode 100755 index 0000000000..28c1ad0f96 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl.v @@ -0,0 +1,18 @@ +`timescale 1 ps / 1 ps + +module 
byteswap_uint ( + input clock, + input resetn, + input ivalid, + input iready, + output ovalid, + output oready, + input [31:0] datain, + output [31:0] dataout); + + assign ovalid = 1'b1; + assign oready = 1'b1; + // clk, ivalid, iready, resetn are ignored + assign dataout = {datain[15:0], datain[31:16]}; + +endmodule diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl_model.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl_model.cpp new file mode 100755 index 0000000000..1c74a74b8b --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl_model.cpp @@ -0,0 +1,6 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +extern "C" unsigned RtlByteswap(unsigned x) { return x << 16 | x >> 16; } diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl_spec.xml b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl_spec.xml new file mode 100755 index 0000000000..361ef11e8a --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_rtl_spec.xml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_sycl.cpp b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_sycl.cpp new file mode 100755 index 0000000000..dcda51b31d --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/lib_sycl.cpp @@ -0,0 +1,7 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +SYCL_EXTERNAL float SyclSquare(float x) { return x * x; } diff --git a/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/use_library.cpp 
b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/use_library.cpp new file mode 100755 index 0000000000..6af7f26437 --- /dev/null +++ b/DirectProgramming/DPC++FPGA/Tutorials/Tools/use_library/src/use_library.cpp @@ -0,0 +1,89 @@ +//============================================================== +// Copyright Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +#include +#include +#include "dpc_common.hpp" +#include "lib.hpp" + +using namespace sycl; + +// Values used as input to the kernel +constexpr float kA = 2.0f; +constexpr float kB = 3.0f; + +// Forward declaration of the kernel name +// (This will become unnecessary in a future compiler version.) +class KernelCompute; + +int main() { + unsigned result = 0; + + // Select either the FPGA emulator (CPU) or FPGA device +#if defined(FPGA_EMULATOR) + intel::fpga_emulator_selector device_selector; +#else + intel::fpga_selector device_selector; +#endif + + try { + queue q(device_selector, dpc_common::exception_handler); + + // The scalar inputs are passed to the kernel using the lambda capture, + // but a SYCL buffer must be used to return a scalar from the kernel. + buffer buffer_c(&result, 1); + + q.submit([&](handler &h) { + + // Accessor to the scalar result + auto accessor_c = buffer_c.get_access(h); + + // Kernel + h.single_task([=]() { + + // OclSquare is an OpenCL function, defined in lib_ocl.cl. + float a_sq = OclSquare(kA); + + // HlsSqrtf is an Intel HLS component, defined in lib_hls.cpp. + // (Intel HLS is a C++ based High Level Synthesis language for FPGA.) + float a_sq_sqrt = HlsSqrtf(a_sq); + + // SyclSquare is a SYCL library function, defined in lib_sycl.cpp. + float b_sq = SyclSquare(kB); + + // RtlByteswap is an RTL library. + // - When compiled for FPGA, Verilog module byteswap_uint in lib_rtl.v + // is instantiated in the datapath by the compiler. 
+ // - When compiled for FPGA emulator (CPU), the C model of RtlByteSwap + // in lib_rtl_model.cpp is used instead. + accessor_c[0] = RtlByteswap((unsigned)(a_sq_sqrt + b_sq)); + }); + }); + } catch (sycl::exception const &e) { + // Catches exceptions in the host code + std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n"; + + // Most likely the runtime couldn't find FPGA hardware! + if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) { + std::cout << "If you are targeting an FPGA, please ensure that your " + "system has a correctly configured FPGA board.\n"; + std::cout << "If you are targeting the FPGA emulator, compile with " + "-DFPGA_EMULATOR.\n"; + } + std::terminate(); + } + + // Compute the expected "golden" result + unsigned gold = sqrt(kA * kA) + (kB * kB); + gold = gold << 16 | gold >> 16; + + // Check the results + if (result != gold) { + std::cout << "FAILED: result is incorrect!\n"; + return -1; + } + std::cout << "PASSED: result is correct!\n"; + return 0; +} diff --git a/Libraries/oneDAL/License.txt b/Libraries/oneDAL/License.txt new file mode 100644 index 0000000000..a3ab05efce --- /dev/null +++ b/Libraries/oneDAL/License.txt @@ -0,0 +1,8 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +© 2020 GitHub, Inc. \ No newline at end of file diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/License.txt b/Libraries/oneDAL/daal4py_Distributed_Kmeans/License.txt new file mode 100755 index 0000000000..a3ab05efce --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/License.txt @@ -0,0 +1,8 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +© 2020 GitHub, Inc. 
\ No newline at end of file diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/README.md b/Libraries/oneDAL/daal4py_Distributed_Kmeans/README.md new file mode 100755 index 0000000000..208d4a7dfe --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/README.md @@ -0,0 +1,112 @@ +# daal4py Distributed K-Means +This sample code shows how to train and predict with a distributed k-means model using the python API package daal4py for oneAPI Data Analytics Library. It assumes you have a working version of MPI library installed and it demonstrates how to use software products that can be found in the [Intel oneAPI Data Analytics Library](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onedal.html) or [Intel AI Analytics Toolkit powered by oneAPI](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html). + +| Optimized for | Description +| :--- | :--- +| OS | 64-bit Linux: Ubuntu 18.04 or higher, 64-bit Windows 10, macOS 10.14 or higher +| Hardware | Intel Atom® Processors; Intel® Core™ Processor Family; Intel® Xeon® Processor Family; Intel® Xeon® Scalable Performance Processor Family +| Software | oneDAL Software Library, Python version 2.7 or >= 3.6, conda-build version >= 3, C++ compiler with C++11 support, Pickle, Pandas, NumPy +| What you will learn | distributed oneDAL K-Means programming model for Intel CPU +| Time to complete | 5 minutes + +## Purpose + +daal4py is a simplified API to Intel® DAAL that allows for fast usage of the framework suited for Data Scientists or Machine Learning users. Built to help provide an abstraction to Intel® DAAL for either direct usage or integration into one's own framework. + +In this sample you will run a distributed K-Means model with oneDAL daal4py library memory objects. You will also learn how to train a model and save the information to a file. 
+ +## Key Implementation Details +This distributed K-means sample code is implemented for CPU using the Python language. The example assumes you have daal4py and scikit-learn installed inside a conda environment, similar to what is delivered with the installation of the Intel(R) Distribution for Python as part of the [oneAPI AI Analytics Toolkit powered by oneAPI](https://software.intel.com/en-us/oneapi/ai-kit). + +## Additional Requirements +You will need a working MPI library. We recommend using Intel(R) MPI, which is included in the [oneAPI HPC Toolkit](https://software.intel.com/en-us/oneapi/hpc-kit). + +## License +This code sample is licensed under the MIT license + +## Building daal4py for CPU + +oneAPI Data Analytics Library is ready for use once you finish the Intel AI Analytics Toolkit installation, and have run the post installation script. + +You can refer to the oneAPI [main page](https://software.intel.com/en-us/oneapi) for toolkit installation, and the Toolkit [Getting Started Guide for Linux](https://software.intel.com/en-us/get-started-with-intel-oneapi-linux-get-started-with-the-intel-ai-analytics-toolkit) for post-installation steps and scripts. + +### Activate conda environment With Root Access + +Please follow the Getting Started Guide steps (above) to set up your oneAPI environment with the setvars.sh script. Then navigate in linux shell to your oneapi installation path, typically `~/intel/inteloneapi`. Intel Python environment will be active by default. However, if you activated another environment, you can return with the following command: + +#### On a Linux* System +``` +source activate base +``` + +### Activate conda environment Without Root Access (Optional) + +By default, the Intel AI Analytics toolkit is installed in the inteloneapi folder, which requires root privileges to manage it.
If you would like to bypass using root access to manage your conda environment, then you can clone your desired conda environment using the following command: + +#### On a Linux* System +``` +conda create --name user_base --clone base +``` + +Then activate your conda environment with the following command: + +``` +source activate user_base +``` + +### Install Jupyter Notebook +``` +conda install jupyter nb_conda_kernels +``` + + +#### View in Jupyter Notebook + +_Note: This distributed execution cannot be launched from the jupyter notebook version, but you can still view inside the notebook to follow the included write-up and description._ + +Launch Jupyter Notebook in the directory housing the code example + +``` +jupyter notebook +``` + +### Running the Sample as a Python File + +When using daal4py for distributed memory systems, the command needed to execute the program should be executed in a bash shell. To execute this example, run the following command, where the number **4** is chosen as an example and means that it will run on **4 processes**: + +Run the Program + +`mpirun -n 4 python ./daal4py_Distributed_Kmeans.py` + +The output of the script will be saved in the included models and results directories. + +_Note: This code sample focuses on how to use daal4py to do distributed ML computations on chunks of data. The `mpirun` command above will only run on a single local node. In order to launch on a cluster, you will need to create a host file on the master node among other steps.
The **TensorFlow_Multinode_Training_with_Horovod** code sample explains this process well._ + +##### Expected Printed Output (with similar numbers, printed 4 times): +``` + + +Here our centroids: + + + [[ 5.46000000e+02 -3.26170648e+00 -6.15922494e+00] + [ 1.80000000e+01 -1.00432059e+01 -8.38198798e+00] + [ 4.10000000e+02 3.78330964e-01 8.29073839e+00]] + +Here is our centroids loaded from file: + + [[ 5.46000000e+02 -3.26170648e+00 -6.15922494e+00] + [ 1.80000000e+01 -1.00432059e+01 -8.38198798e+00] + [ 4.10000000e+02 3.78330964e-01 8.29073839e+00]] +Here is our cluster assignments for first 5 datapoints: + + [[1] + [1] + [1] + [1] + [1]] +[CODE_SAMPLE_COMPLETED_SUCCESFULLY] + +``` + + diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/daal4py_Distributed_Kmeans.ipynb b/Libraries/oneDAL/daal4py_Distributed_Kmeans/daal4py_Distributed_Kmeans.ipynb new file mode 100755 index 0000000000..8d245508dc --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/daal4py_Distributed_Kmeans.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================\n", + "# Copyright © 2020 Intel Corporation\n", + "# \n", + "# SPDX-License-Identifier: MIT\n", + "# =============================================================" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Daal4py K-Means Clustering Example for Distributed Memory Systems [SPMD mode]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IMPORTANT NOTICE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When using daal4py for distributed memory systems, the command needed to execute the program should be **executed \n", + "in a bash shell**. 
In order to run this example, please download it as a .py file then run the following command (**the number 4 means that it will run on 4 processes**):" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mpirun -n 4 python ./daal4py_Distributed_Kmeans.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing and Organizing Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we will be using K-Means clustering to **initialize centroids** and then **use them to cluster the synthetic dataset.**\n", + "\n", + "Let's start by **importing** all necessary data and packages." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "##### daal4py K-Means Clustering example for Distributed Memory Systems [SPMD Mode] #####\n", + "import daal4py as d4p\n", + "import pickle\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **load** in the dataset and **organize** it as necessary to work with our model. For distributed, every file has a unique ID.\n", + "\n", + "We will also **initialize the distribution engine**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "d4p.daalinit() #initializes the distribution engine\n", + "\n", + "# organizing variables used in the model for prediction\n", + "# each process gets its own data\n", + "infile = \"./data/distributed_data/daal4py_Distributed_Kmeans_\" + str(d4p.my_procid()+1) + \".csv\"\n", + "\n", + "# read data\n", + "X = pd.read_csv(infile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing and Saving Initial Centroids" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time to **initialize our centroids!**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# computing initial centroids\n", + "init_result = d4p.kmeans_init(nClusters = 3, method = \"plusPlusDense\").compute(X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To **get initial centroid information and save it** to a file:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here's our centroids:\n", + "\n", + "\n", + " [[ 5.46000000e+02 -4.95417384e-01 8.83354904e+00]\n", + " [ 1.80000000e+01 -4.12886224e+00 -7.35426095e+00]\n", + " [ 4.11000000e+02 -3.27940151e+00 -6.22280477e+00]] \n", + "\n" + ] + } + ], + "source": [ + "# retrieving and printing initial centroids\n", + "centroids = init_result.centroids\n", + "print(\"Here's our centroids:\\n\\n\\n\", centroids, \"\\n\")\n", + "\n", + "centroids_filename = './models/kmeans_clustering_initcentroids_'+ str(d4p.my_procid()+1) + '.csv'\n", + "\n", + "# saving centroids to a file\n", + "pickle.dump(centroids, open(centroids_filename, \"wb\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **load up the centroids** and look at them."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is our centroids loaded from file:\n", + "\n", + " [[ 5.46000000e+02 -4.95417384e-01 8.83354904e+00]\n", + " [ 1.80000000e+01 -4.12886224e+00 -7.35426095e+00]\n", + " [ 4.11000000e+02 -3.27940151e+00 -6.22280477e+00]]\n" + ] + } + ], + "source": [ + "# loading the initial centroids from a file\n", + "loaded_centroids = pickle.load(open(centroids_filename, \"rb\"))\n", + "print(\"Here is our centroids loaded from file:\\n\\n\",loaded_centroids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Assign The Data to Clusters and Save The Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's **assign the data** to clusters." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# compute the clusters/centroids\n", + "kmeans_result = d4p.kmeans(nClusters = 3, maxIterations = 5, assignFlag = True).compute(X, init_result.centroids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To **get Kmeans result objects** (assignments, centroids, goalFunction [deprecated], nIterations, and objectiveFunction):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# retrieving and printing cluster assignments\n", + "assignments = kmeans_result.assignments\n", + "print(\"Here is our cluster assignments for first 5 datapoints: \\n\\n\", assignments[:5])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 
"3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/daal4py_Distributed_Kmeans.py b/Libraries/oneDAL/daal4py_Distributed_Kmeans/daal4py_Distributed_Kmeans.py new file mode 100755 index 0000000000..611abc988c --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/daal4py_Distributed_Kmeans.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +''' +============================================================= +Copyright © 2020 Intel Corporation + +SPDX-License-Identifier: MIT +============================================================= +''' + +# # Daal4py K-Means Clustering Example for Distributed Memory Systems [SPMD mode] + +# ## IMPORTANT NOTICE + +# When using daal4py for distributed memory systems, the command needed to execute the program should be **executed +# in a bash shell**. In order to run this example, please download it as a .py file then run the following command (**the number 4 means that it will run on 4 processes**): + +# mpirun -n 4 python ./daal4py_Distributed_Kmeans.py + +# ## Importing and Organizing Data + +# In this example we will be using K-Means clustering to **initialize centroids** and then **use them to cluster the synthetic dataset.** +# +# Let's start by **importing** all necessary data and packages. + +# In[2]: + + +##### daal4py K-Means Clustering example for Distributed Memory Systems [SPMD Mode] ##### +import daal4py as d4p +import pickle +import pandas as pd +import numpy as np + + +# Now let's **load** in the dataset and **organize** it as necessary to work with our model. For distributed, every file has a unique ID. +# +# We will also **initialize the distribution engine**. 
+ +# In[3]: + + +d4p.daalinit() #initializes the distribution engine + +# organizing variables used in the model for prediction +# each process gets its own data +infile = "./data/distributed_data/daal4py_Distributed_Kmeans_" + str(d4p.my_procid()+1) + ".csv" + +# read data +X = pd.read_csv(infile) + + +# ## Computing and Saving Initial Centroids + +# Time to **initialize our centroids!** + +# In[4]: + + +# computing inital centroids +init_result = d4p.kmeans_init(nClusters = 3, method = "plusPlusDense").compute(X) + + +# To **get initial centroid information and save it** to a file: + +# In[5]: + + +# retrieving and printing inital centroids +centroids = init_result.centroids +print("Here our centroids:\n\n\n", centroids, "\n") + +centroids_filename = './models/kmeans_clustering_initcentroids_'+ str(d4p.my_procid()+1) + '.csv' + +# saving centroids to a file +pickle.dump(centroids, open(centroids_filename, "wb")) + + +# Now let's **load up the centroids** and look at them. + +# In[6]: + + +# loading the initial centroids from a file +loaded_centroids = pickle.load(open(centroids_filename, "rb")) +print("Here is our centroids loaded from file:\n\n",loaded_centroids) + + +# # Assign The Data to Clusters and Save The Results + +# Let's **assign the data** to clusters. + +# In[7]: + + +# compute the clusters/centroids +kmeans_result = d4p.kmeans(nClusters = 3, maxIterations = 5, assignFlag = True).compute(X, init_result.centroids) + + +# To **get Kmeans result objects** (assignments, centroids, goalFunction [deprecated], nIterations, and objectiveFunction): + +# In[8]: + + +# retrieving and printing cluster assignments +assignments = kmeans_result.assignments +print("Here is our cluster assignments for first 5 datapoints: \n\n", assignments[:5]) + + +# Now let's **export the cluster assignments** to a **CSV file**. 
We will also **stop the distribution engine.** + +# In[9]: + + +# now export the results to a CSV file +results_filename = "./results/daal4py_Distributed_Kmeans_results_" + str(d4p.my_procid()+1) + ".csv" +np.savetxt(results_filename, assignments, delimiter=",") + +d4p.daalfini() # stops the distribution engine +print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]') + diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_1.csv b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_1.csv new file mode 100755 index 0000000000..7f45cc383f --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_1.csv @@ -0,0 +1,601 @@ +,0,1 +0,1.8723965807238774,9.423076509081708 +1,2.3039829051048324,9.018580186635331 +2,1.026073451114729,8.077031073995432 +3,-8.541605432789838,-8.00341797317489 +4,-2.369401173189912,-6.975984734090902 +5,2.5806063684707987,10.217573085425613 +6,-9.284790097193767,-7.4097633737311375 +7,-3.522989397130624,-6.613010192479875 +8,1.027142105078586,8.80052166519931 +9,-3.2697078620655873,-5.893962453251839 +10,-8.885013378341931,-7.333503638663275 +11,-2.0396671351470204,-7.044169599415981 +12,-4.597620364286053,-7.453330250170903 +13,-6.456202788660033,-7.212171290757356 +14,1.4445918585874757,7.142250586300567 +15,2.1173329239322483,7.70433584845831 +16,-5.4802518839548515,-6.079679059100007 +17,-2.244797527183411,-7.0731238440633675 +18,-4.128862239659543,-7.354260953773871 +19,1.6884289800429058,9.819116848158599 +20,-8.610019349301364,-7.939921364213658 +21,-9.056524840424752,-8.6211365869879 +22,3.3059953897609953,7.8815868414717425 +23,-4.080336467441554,-6.722207807125867 +24,-2.6893506939487377,-5.168503633974499 +25,-3.3282809849682,-7.379647324555367 +26,1.5129512034685555,8.577779572751652 +27,-10.115832037074668,-6.748848596820765 +28,3.6388808782426136,9.185890863280441 
+29,-3.00028100453964,-5.969928023226169 +30,-9.93319036402656,-5.786793165788258 +31,1.1090324024414029,9.291161672490887 +32,-2.8613517909224715,-5.354351487416557 +33,1.0291251821957932,7.976676963358322 +34,2.9700930258076887,9.467411686683057 +35,-8.168821654492318,-5.790117115599025 +36,-4.145244396361411,-6.28233426294262 +37,-4.707198834288474,-5.984294898402102 +38,-10.54314968159905,-6.521137980753572 +39,-2.8107961027624766,-7.600863870998378 +40,-8.343541203130618,-7.5235346431712875 +41,-3.561265069901654,-6.47598120139939 +42,1.602214674861818,9.914325233595067 +43,3.350406430966756,11.289498620860265 +44,-6.890126059113473,-6.852073175637249 +45,-8.926423471428778,-7.727377511846929 +46,0.8164245936495235,8.916619307728906 +47,1.97328080691373,8.403853043582245 +48,-3.348392512406978,-7.97108628676181 +49,-9.020767109684183,-6.546889799655273 +50,-8.833926198022965,-7.428422280319241 +51,2.5346141339862474,9.65332832411909 +52,-4.434886357325489,-7.066557876144199 +53,2.8120211842088287,9.696878530991425 +54,-2.0128979493397274,-7.269013395654566 +55,-5.168355005107381,-4.699037335501423 +56,-2.7036500461294777,-5.5151627803873655 +57,1.9776778941634992,8.919828769286113 +58,-10.340388439013493,-5.786781689624086 +59,-5.101517295194648,-4.830476798613082 +60,-4.800994153532466,-3.3894922643890757 +61,-8.470818425322038,-6.416551078654722 +62,1.2163842111707046,8.240738678586231 +63,-7.573810331519125,-5.989336620836275 +64,-8.229867526485254,-7.193233970728691 +65,-4.901514626307628,-7.585912020677485 +66,-9.231559142391077,-9.19746931272156 +67,-1.9230752736805574,-5.506535847318018 +68,-9.437917519275558,-7.826778197496669 +69,-7.659160480421658,-6.528634381080725 +70,-3.6438217631687175,-6.8571324102789 +71,-4.9559928341827435,-5.264716327780803 +72,-3.6526387444195936,-5.61501979239173 +73,-10.73660232270179,-9.618413219025292 +74,-7.850988109207649,-7.689798105008403 +75,0.8701242424569892,10.293793647904694 
+76,-6.335238230849191,-8.227788204093063 +77,-2.052319669530889,-7.376260216839449 +78,-7.065371800261501,-7.653736489908707 +79,-4.869972297466505,-5.80687913084216 +80,0.9551076290473562,8.600680899343779 +81,1.61798032639464,9.514230838946418 +82,1.290303039005867,8.874762993091842 +83,1.7169297196710165,7.722133429760188 +84,-8.97140030945659,-9.953721246636322 +85,2.9441411029062454,10.073257625162373 +86,-8.562882804324602,-7.806641777045539 +87,2.415995112048434,9.36434699718389 +88,-4.56418486735835,-7.562291899884148 +89,-3.3726386527952292,-5.439598282833402 +90,-6.831168562244628,-8.563364402768862 +91,1.394409575533413,9.279097027932696 +92,-8.488381038275751,-6.724898042083206 +93,-9.504913246191423,-8.300327567851 +94,-6.948849104914956,-6.521919194615732 +95,-9.352569270247198,-7.774503943661293 +96,-8.948787065859575,-9.07106742804278 +97,-8.888113050595429,-6.3873028327127805 +98,2.4763833460431126,9.240278530822199 +99,-9.313632569339555,-5.737661563703921 +100,-7.7346131143536905,-7.821933935118758 +101,2.2174107493455897,9.652396968167826 +102,-2.689779361042282,-7.386373883622923 +103,-8.46279863033734,-7.90876135235025 +104,-10.291976305929914,-8.593599798344943 +105,-10.100955971386144,-5.991161008843616 +106,0.43397101161108087,10.232991025840189 +107,0.46895105516566526,8.378385146013642 +108,-7.373252885214495,-6.825063639557488 +109,-0.130378027403375,6.9190464066546715 +110,-4.020200293578918,-6.430029396302369 +111,-0.08921309008431444,8.191850705666758 +112,-3.8653134420514803,-5.4694094235747 +113,1.628227842220048,9.309015751656906 +114,-9.594737733986559,-9.003997363347098 +115,-4.769281807154009,-8.417076408378273 +116,1.6647750920863866,9.955257929870456 +117,-4.419150978955475,-5.539302132893461 +118,-8.566286257143563,-8.111681850177206 +119,-4.960922739518262,-6.5069242911263565 +120,-8.845626011494295,-5.780985990719593 +121,-3.803368204368129,-7.098085700545766 +122,-0.10628537209794042,8.653072759332863 
+123,-4.389175840066328,-5.728288748906668 +124,-8.843339430157748,-6.307656890829538 +125,-3.7763663979968274,-6.772988814075323 +126,-1.8945877782961802,-5.67635112725773 +127,0.03130878326482467,9.65962901724951 +128,-8.934480603545063,-7.667867555390796 +129,-3.1632449424398255,-5.781542666823375 +130,-8.698422364088024,-6.923785911172841 +131,-3.438697704586029,-8.048713246022794 +132,-3.635278143631961,-7.391709231534364 +133,1.4878816870410658,10.434460320285321 +134,0.33919524795335887,8.360303461972801 +135,-2.912412623932284,-7.8874631834968305 +136,1.812496486099853,9.607077515538352 +137,0.6232376545308436,7.792875411524995 +138,-2.557308368694793,-5.6452808794139955 +139,-4.034444015689002,-9.090709842422477 +140,-5.5278062005678015,-5.861445423122923 +141,-2.80822541498089,-7.153794823046844 +142,-2.580346473036328,-6.956940028675239 +143,-9.031388716432902,-8.725501431311354 +144,-8.711556977947783,-7.0166882679856855 +145,-3.6924472044053593,-5.979849731209238 +146,-2.400875296282025,-7.3654860474319435 +147,-9.502494612938754,-6.882389220685811 +148,-8.383355689087542,-8.517502105879297 +149,1.6891420003666533,8.24934183280459 +150,2.740338881826161,9.545020463701205 +151,-4.247441724425719,-6.211270016948834 +152,0.9847351521480718,6.9708198696061485 +153,2.4114607797001315,8.73350761238257 +154,1.2320784072906812,8.97897642364675 +155,-2.1393920629714054,-5.057791738024882 +156,2.3671832737262184,8.46375484267337 +157,-9.74988460698268,-7.482378149268028 +158,2.3000007665895907,10.721851619443242 +159,-5.50780281880151,-6.428946324501107 +160,-2.897479913080972,-5.172777184331416 +161,-0.657999396336646,9.666995011660223 +162,-3.378063010317772,-6.323195949284838 +163,3.715349030367662,8.079777142112702 +164,-2.5346022348849875,-6.554414611933407 +165,-3.7261095610540207,-6.525586577496511 +166,-4.400979092409175,-5.482685404964332 +167,-5.183156574865519,-4.94726433336997 +168,1.7362635501608856,8.196687567472695 
+169,2.3972014974539775,10.113895861761995 +170,-9.632491326549443,-8.017260707708173 +171,-0.5594012154597015,7.604364100618122 +172,2.3416747811127054,7.006744152775529 +173,0.8769675389837549,8.631891224545527 +174,-2.9879989464742245,-6.616673149083951 +175,-11.48989377544951,-6.116923703694228 +176,-7.043559206169571,-7.389109142947291 +177,3.949051276063594,7.963460398342924 +178,-8.922561846082933,-6.752903793551254 +179,-8.430183397712431,-8.504082904889342 +180,0.6245943758437478,8.199459766621962 +181,-5.780096953221933,-6.675029748733625 +182,-8.637738748456233,-6.569969041154689 +183,-8.951838895375104,-8.051232000315483 +184,-9.626070914197353,-6.264628060914582 +185,-9.145990202367672,-7.406681887093229 +186,-3.434801052362838,-6.187579768673667 +187,1.665414669161203,9.31199325225241 +188,1.9857290962543637,8.676310707858823 +189,-9.14129259344747,-7.564598304723135 +190,1.7803979810077042,9.265052880657668 +191,0.4667425063754851,7.805511391561872 +192,1.0665242623902753,8.781668124487213 +193,0.9859804989464445,9.33935533880398 +194,-4.224897569796012,-5.301022974691385 +195,2.180032721568203,8.556214069490203 +196,-4.363950110086683,-6.887267518229789 +197,-5.345620360032975,-5.345902231515946 +198,-9.873822668160358,-8.014234349652336 +199,1.8726679327582603,10.177105298248879 +200,1.5628991025595353,9.56172143949572 +201,1.0929361214337034,7.945435252037524 +202,1.817975788982048,9.973506416969428 +203,2.3421387813360517,5.507999588746552 +204,-8.545495510460189,-7.839907819878756 +205,-1.834346050272757,-7.334320951620844 +206,-8.7811793481458,-8.885832599647916 +207,3.3101922716031003,9.962395815306802 +208,0.7677167907391013,7.754761245418569 +209,-2.453242944764968,-8.345166172672885 +210,-2.1674273959241184,-5.6657351714780875 +211,3.129009590443727,7.285738404952774 +212,-4.390632916615479,-6.8399659983240095 +213,-8.70362515574467,-8.113278417832024 +214,-8.185032590903454,-8.245519546204777 +215,-8.554265650421904,-6.354471258905091 
+216,-3.6238109363554485,-7.990631339289265 +217,-8.422208256528265,-6.237949347003386 +218,-9.062556919549863,-9.185027910001303 +219,-11.498894787253768,-6.882960931814834 +220,-10.275788114535857,-7.944099185829921 +221,1.7095239756030542,9.596021419978454 +222,4.030894272872818,8.587812385034185 +223,-9.317187475710424,-9.439334686580088 +224,-3.6530939607148105,-6.970908550723431 +225,1.651465790229752,9.927927019815462 +226,2.0700898465438335,9.64101532767761 +227,-4.146806648249522,-6.583169244052819 +228,-9.41873849787954,-6.296965466311001 +229,2.75105345921606,8.63533599691335 +230,-7.117352937226269,-8.354981611155464 +231,-4.113647867447832,-7.039554319326716 +232,-3.9940376887494233,-6.790554267856149 +233,-4.876577053923553,-5.290597175632066 +234,0.9835666371302964,8.301891192543446 +235,-3.4038342312836707,-9.032867323830587 +236,-4.141554979311696,-7.967736281926432 +237,-8.617460702681221,-6.628626633657286 +238,1.403168461357089,8.98081385091177 +239,1.2093458294649204,9.658775773256561 +240,-4.055100012797214,-5.294792682973709 +241,-2.783087065823997,-7.77376498017402 +242,-8.628549162973693,-4.9787131649897125 +243,0.7151568906878767,9.180827710651107 +244,-8.522317902226503,-7.285706505930886 +245,-8.487428757558858,-7.024641274080103 +246,-4.1178652200658314,-7.396756418455362 +247,1.7447684534115906,9.031670566483196 +248,-9.310081107059657,-7.234580417715351 +249,1.9332101560563706,8.007104563168681 +250,1.7310352269945617,7.384783889660245 +251,1.7947059994721024,9.882532713018065 +252,-7.289414765548567,-10.012923571907196 +253,-6.990409331327357,-7.133331998874008 +254,1.5316843670219304,6.587370791652179 +255,-8.810652853340734,-7.740094173343104 +256,-5.308610848170018,-7.995193978631127 +257,-4.50146448126074,-5.685088920689693 +258,-8.207486482202814,-8.232203072496342 +259,-3.2395098150180077,-7.889312916269965 +260,0.19520339576906864,8.684534892712938 +261,1.0810456300141347,9.895907121450538 
+262,-10.241820115759646,-8.4574144650483 +263,1.4843270015157928,10.48844276076728 +264,-2.0528995769229246,-5.109246700530515 +265,-2.80602875067267,-7.116607906192952 +266,3.5303986412736306,9.16153512008025 +267,1.1871221323757888,9.649626296305442 +268,1.1662609737079332,7.753907332737844 +269,0.7520200489476977,7.953333506449035 +270,1.6157852796571222,8.57244834025245 +271,-3.8069406639093133,-7.425164023262084 +272,-2.868721951203068,-7.394562784992771 +273,-3.939070909536578,-6.813736857807444 +274,1.64420639444631,8.873192455820949 +275,-8.274472346059143,-6.637532536181894 +276,0.5714796608788868,8.018913195681947 +277,-3.827618940828862,-8.38233220367727 +278,-0.6961214572120009,7.678277612948944 +279,-3.543635462461846,-4.725334508934144 +280,-3.1822698131257106,-7.192578829145819 +281,-3.7129501936056197,-8.650556086913245 +282,-2.9029511162275186,-7.3689026849263755 +283,0.3851146360298001,9.836714477121308 +284,-6.8909485133434,-7.558780860440229 +285,-7.980234843184486,-6.322261578874826 +286,-8.931259986955775,-8.450579870297657 +287,1.6737535480317962,8.65024853655783 +288,0.826709738075403,8.09805941812536 +289,-9.302631974352856,-9.0454586149317 +290,-4.908765883636813,-7.065364826691322 +291,0.9272482963727093,9.151744299047177 +292,-4.704891488369021,-5.223417114033953 +293,-8.66128930295533,-9.421322598638117 +294,-2.3317547848791245,-7.277325144699796 +295,-10.113162936059052,-5.52573503696392 +296,-3.1446284099964332,-4.677418598405924 +297,-2.743809044607817,-8.05953682348741 +298,-10.220236959264621,-7.02780678286387 +299,-4.915008120364791,-5.091923121986243 +300,3.0639853629328946,9.549426153038652 +301,-7.208099932433294,-5.75728435160728 +302,-3.5252647247033626,-6.052245305030447 +303,-7.686145068033298,-6.062628295638922 +304,-3.1726651779710164,-5.228661146280547 +305,1.2982626763148466,9.87163188584001 +306,-8.471380564558972,-7.856583039077515 +307,-2.4194243830958095,-5.25583697706461 +308,-6.681092123226286,-7.612664207772949 
+309,-9.125540384037064,-8.549585667559677 +310,3.0320588919262574,7.831240489969865 +311,-8.221064029858395,-7.236701215350069 +312,1.2412950888061665,10.017138742473664 +313,-9.535046634147104,-8.769890960265467 +314,-3.408398280085348,-5.96382974621914 +315,0.5413432601850015,9.097822363391515 +316,2.763691787338296,9.063179309030534 +317,0.9505229335340074,8.862938140668978 +318,-2.5189564601842322,-7.826025922618833 +319,1.4504727139845266,7.364710123749552 +320,-3.7735925506268915,-7.65494713405503 +321,1.4250384640172251,8.762099855880837 +322,3.160693347229116,7.6738226049792235 +323,1.7717731821304106,8.650182596558299 +324,-10.071016493767532,-7.337461506157067 +325,-8.55532715752027,-7.666000790269656 +326,-3.2204015351444726,-6.391362446736852 +327,-10.264066074526038,-8.27713867477806 +328,-8.46885778216139,-8.101892160300661 +329,-9.404541129281325,-8.21904518562134 +330,-4.072917331149152,-7.338156678245081 +331,-3.696902662518048,-8.11808079390431 +332,2.141842893029363,9.779204635114384 +333,1.4972505342845004,8.273713710336331 +334,-8.846787937810072,-9.02730383684036 +335,1.7660568824918503,8.1418782533338 +336,-9.651176645541238,-7.4042749213795895 +337,-3.606196666260453,-5.265726859015916 +338,2.828827773136019,9.487546474030562 +339,-4.400891371149478,-5.5457711253957385 +340,-2.307384358733934,-7.348399475401877 +341,1.1715850294245174,10.345182298603854 +342,1.4223996598626933,8.37771842373096 +343,1.451342637658223,9.826895261036695 +344,-10.132928887943466,-7.747290072161744 +345,-10.077173192464576,-7.7009641307717445 +346,-2.7403945790533495,-6.691107926961068 +347,-3.685188266257063,-7.118394980068402 +348,1.4297614168899035,8.991190711234426 +349,0.9699181835479108,10.008555079871995 +350,1.0370607398720144,9.630783542637518 +351,-2.752474877022182,-6.595211929514176 +352,1.7283514396945507,8.46295371511547 +353,2.347624499061898,9.269667488789345 +354,-3.1991203481039188,-7.586030234867592 +355,-10.702953880465959,-6.064055420015948 
+356,2.139487696622963,8.192097506911837 +357,-9.802700270912315,-7.239184432887089 +358,-8.192232955020291,-8.17058960139829 +359,1.8855654851018984,7.5057242164919415 +360,-8.813873220046947,-7.472411012653497 +361,-4.129155979165331,-7.1065424150176195 +362,1.1012733349681794,10.472684478931251 +363,-9.717029748435161,-7.727844370470556 +364,-2.344091878136072,-5.280345353541628 +365,-8.782481937816403,-7.366299960389979 +366,3.8553344743880937,7.602650026832679 +367,-3.775402698272785,-5.668250996662408 +368,1.658484193833966,8.540554769258135 +369,-4.3526321806296,-7.192497620325609 +370,2.893876132796926,8.62656237208975 +371,-4.174631508186347,-7.826334613441495 +372,-10.550816260775768,-8.439343051683956 +373,1.8005486117342335,9.280398775888468 +374,1.3232385233144566,8.949975443235546 +375,1.5881939128241624,8.267200170691533 +376,-8.41536017266014,-7.728288579010116 +377,-3.4947561605515896,-7.483636021577496 +378,-0.15892687409951511,9.234260631001696 +379,-4.6823048714848925,-5.524518995949238 +380,-7.840175300256336,-7.503496406404144 +381,0.8365315425003037,8.22878104219965 +382,-8.744928882159561,-6.654946107242386 +383,-8.195667876629038,-8.196815597437658 +384,0.950055450605099,8.740084428459868 +385,1.4475900430160649,8.075076799105 +386,0.41654693130221077,8.635406961568314 +387,-9.27810291136573,-8.33645820603062 +388,-4.1615442027987894,-5.1658904153231955 +389,2.3858159594389976,8.07217645185868 +390,1.211024985945369,9.284066645806027 +391,-8.754916718713156,-8.370432633440902 +392,-9.826823645684605,-8.147552258744406 +393,0.3162752659865671,8.970021210588147 +394,1.5396120233089956,9.760003575747081 +395,0.07905807755856809,7.420249614630872 +396,-3.143881204768702,-6.214537881949882 +397,-8.170847736346687,-8.631169866939592 +398,-6.045080792096531,-6.927595217572093 +399,-7.479253588599581,-8.566301640143283 +400,-9.649688603020303,-8.274248401618797 +401,-8.403343361841443,-7.5147391259480845 +402,2.5358250807959606,8.309735399163488 
+403,-3.4525140796501756,-6.637523823522272 +404,-2.572164495885186,-5.180095563618318 +405,1.9915612782471583,9.018087987152844 +406,-9.667516437002341,-5.889585605577505 +407,-8.933263844689996,-7.187094251719924 +408,-4.179465573588384,-6.155806824738958 +409,-4.98886553816402,-7.686720696788742 +410,1.544001113411751,10.154643826195707 +411,-3.279401506436028,-6.222804774858151 +412,-11.079237642121381,-8.447160410085862 +413,0.40898827425021644,9.523909932918087 +414,-7.0896159034552095,-7.375663059325421 +415,2.466951700395066,9.048522599661807 +416,1.3549265846304421,9.990276162396738 +417,-10.179634209317538,-6.551993593527024 +418,-2.873417246165876,-3.8708787407862117 +419,-3.5838799755018966,-7.238830183356529 +420,-8.66797609357964,-8.756997215451886 +421,0.7780488587914264,11.206822640440464 +422,-9.645810041253542,-6.512757622590329 +423,-3.618287088016478,-4.330567121113075 +424,-9.585955907487172,-7.564147078708663 +425,-8.740080835667838,-7.093178350044198 +426,-4.340047884211928,-6.289283842398694 +427,-3.539329548331343,-5.617207365017552 +428,0.8145812261578539,9.611219247626572 +429,1.5550445193814766,8.643999636842388 +430,-3.1660419049098456,-5.058791693995781 +431,-8.506868108800434,-8.230186674963203 +432,0.06495063517682231,8.597925521664141 +433,-5.1787863980691915,-6.790391702635773 +434,-2.3533743909411706,-7.154986513522051 +435,2.2132024398457215,10.20995443280583 +436,2.4179291103423024,8.770756439334107 +437,-8.050361016111665,-7.970934054841077 +438,-4.314910857410013,-6.592581802720625 +439,-8.468991499952105,-8.540475092737868 +440,-3.6745287386681724,-7.801683486768152 +441,2.6769491507548704,8.929830316657585 +442,-9.253226611110007,-9.648182830809313 +443,-9.229332311167068,-7.670127308808654 +444,2.005645904691318,7.548333527232663 +445,-9.811019277363656,-7.5469107802949456 +446,0.7705117663342075,9.434756976583303 +447,-3.193475478937037,-6.880808783112333 +448,-4.65246333395054,-6.350190774248163 
+449,-3.924403785124271,-5.343525556631257 +450,0.5772856705077776,7.009740329844672 +451,-9.603555842930872,-7.081788382561963 +452,-4.140793777836625,-5.798587400881218 +453,-2.66124447687007,-8.585421104932015 +454,-4.4288076969023304,-6.34015358650103 +455,3.2987750889119223,8.891476862390457 +456,-10.311638892527279,-8.5792015261938 +457,-7.839470473020748,-7.80022621289589 +458,-8.824423145669883,-7.635995838947142 +459,-10.545470826021129,-7.975057715359307 +460,-4.090402313141579,-6.387145665940683 +461,-3.3930352700168953,-5.845673158148595 +462,-9.707906851582454,-8.178708047475155 +463,-8.453421549788144,-7.817722041984739 +464,3.7672808922012364,8.984680790128612 +465,-8.62245546740891,-8.99718474278544 +466,-4.105378275540775,-7.127140428716648 +467,2.289373378242087,9.032110991531445 +468,2.061570865234709,7.918839971777789 +469,-8.762917704970183,-7.593919503708617 +470,0.7466239496955772,7.202831373388115 +471,-4.166259049478555,-6.3568239760200855 +472,-10.254542843922076,-9.055265472063146 +473,-4.750741610871712,-6.30369747163497 +474,-8.741288195777253,-7.433891031459792 +475,2.744327795431177,9.639480684161692 +476,-2.7708325992894016,-6.0843946172169225 +477,-4.76250699365201,-5.666442283275416 +478,3.4120302342320437,9.791303820459628 +479,-8.456835872500541,-8.387123016486056 +480,-9.384927307809502,-9.039705459432938 +481,2.0001815688605764,7.458557021764323 +482,-8.973037900370395,-6.782326594034474 +483,1.1996468541091247,9.297809246320538 +484,0.8694674171783726,7.750627157483518 +485,-4.523949268699284,-6.068240796348103 +486,-11.114355667874628,-6.009628431691224 +487,0.22680744202711955,7.947038915303593 +488,-7.17883729541921,-10.007204254185938 +489,2.731860327313859,8.410358774077515 +490,2.369237333738112,8.118311518684367 +491,0.39718837250979067,9.689950662878346 +492,-10.594062983984935,-8.447903599968848 +493,-9.126887556946118,-7.586352247878466 +494,2.80772621833424,10.455228464896777 
+495,-7.0700967084862425,-8.57191213659888 +496,0.2549569332878978,8.6630049015454 +497,0.5552308201889604,7.335795109030309 +498,2.358611871194677,9.42853369413311 +499,-9.854722791655632,-5.975132053549338 +500,2.243231929027873,9.028347430604606 +501,1.5712474856411065,8.388659822752174 +502,-8.402385914876355,-7.29554952535777 +503,2.20979232884183,10.17928859486892 +504,-10.374426936530666,-8.238281019318837 +505,-8.424020411194533,-6.068413075406102 +506,-10.439177579134729,-8.535513899022584 +507,-0.14858609159112524,9.867483448394383 +508,-3.789324744136027,-7.982485942795915 +509,1.5389893917548934,9.287642786529204 +510,-7.059422065959233,-7.768515167469511 +511,-8.273091794901928,-8.037743039265841 +512,-3.270714016837828,-6.443698650379267 +513,2.3719990096339716,9.488163674871737 +514,-7.494631778718563,-7.473736331488532 +515,3.177760269186437,10.301230325142157 +516,2.4847647075069466,9.044467921894118 +517,0.4747245556943478,9.479618388053982 +518,0.7307387064190303,7.344054451889517 +519,-3.718673556633533,-6.24751242488956 +520,-4.203934495595819,-5.477246935642584 +521,-10.956527197366361,-6.52839446404532 +522,-10.45540151347699,-9.656325885434203 +523,-3.3244805319519335,-6.651382585484791 +524,-8.6399626185148,-7.7781849322487915 +525,-9.31002459640484,-6.1691136682104055 +526,1.099132145651921,7.5104262897193435 +527,2.093363894780156,11.175939460291444 +528,-5.360987834135632,-7.628857486729695 +529,-4.179015108657714,-5.247315172788372 +530,-8.127839977129517,-7.414407795917798 +531,1.158039802878669,9.249295664364052 +532,1.0128605891160136,7.976993569544224 +533,-9.875102589864639,-8.781677789193871 +534,-9.22013369599315,-7.5753618079370195 +535,-9.593881768546728,-7.725725079429768 +536,2.688601106716163,8.004973440849191 +537,4.957863449887785,6.824437855793683 +538,2.027139370684587,8.443151933376354 +539,2.816276129933087,6.992273825369817 +540,-3.703838852513523,-6.2325005307085 +541,-2.281266775913224,-5.443136852045913 
+542,-2.677881761014138,-6.8500830556833785 +543,-3.8185509912833515,-6.770154442633775 +544,-4.474543958990265,-5.364280370240795 +545,3.6302758672036366,7.870772188721467 +546,-0.4954173840642997,8.83354903535833 +547,-4.933280089876337,-5.261445371517363 +548,2.769698704951349,8.316545779971348 +549,-7.440664489021605,-6.180802953856893 +550,-9.473443460332929,-5.919257247274239 +551,0.9554491225982502,10.210935616318974 +552,2.7318748547755574,8.477537572978214 +553,2.9389732030994145,9.34148017129097 +554,-2.5692777445315715,-8.844791418470916 +555,-3.2603924590832447,-7.938713700724147 +556,-9.189116520727595,-8.468832973746789 +557,-4.557200370061501,-7.000110051833846 +558,-10.310033488435808,-7.495409782173115 +559,3.5759520905493143,7.932552255378711 +560,-7.327502792697469,-8.064658951024994 +561,-8.644168033316486,-7.906749723023718 +562,-4.5949896500763865,-5.43463703703466 +563,1.4180919145570554,7.026326398380954 +564,0.45599436901834345,9.294987403447209 +565,-8.929436959993833,-7.480089690122106 +566,1.7175477484811223,8.364498296468984 +567,-5.467083094870308,-6.522829897019783 +568,-2.8260330465980656,-6.314183995088117 +569,2.7994661468285633,8.627035021451887 +570,-3.8130436612767333,-6.798958321567207 +571,-3.607663430217996,-7.5267925631239905 +572,2.228804491245238,8.375852541779631 +573,3.4434183417396698,8.798086800122887 +574,-4.162618879661871,-7.819197266456447 +575,-3.7534828423258375,-5.381187830197566 +576,-7.461643143042459,-9.823978973461502 +577,2.54046667023458,8.086545273577975 +578,-7.650623637676037,-7.801223218648066 +579,-1.9356176107537841,-6.0676767574872805 +580,-8.725560271354123,-7.708164129909977 +581,-8.145985855304046,-6.633385506488888 +582,2.733082524628241,9.760364921676338 +583,-4.42086680978624,-6.895980047202914 +584,-9.943451626876106,-6.986204805483118 +585,2.973971827819502,8.445490492913766 +586,-3.1652790392218155,-7.066419255292364 +587,-7.877341354752501,-7.709253650253533 
+588,1.382700039806736,8.552424788289164 +589,-4.549904887161464,-7.329165511971282 +590,1.562977517993382,10.431232258726215 +591,3.3463392740883675,10.257316238901439 +592,1.8017955090274103,8.107285812647607 +593,-8.06862812548864,-7.566035062135063 +594,-9.730384956127192,-6.640099526564223 +595,-3.7343824996753328,-6.603934480176569 +596,3.0385396810769523,8.263913515332236 +597,-9.600772140819236,-8.457568621982452 +598,-3.787016951021117,-7.062040066347958 +599,-4.256871138865526,-7.020373510207024 diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_2.csv b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_2.csv new file mode 100755 index 0000000000..113f76e0eb --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_2.csv @@ -0,0 +1,601 @@ +,0,1 +0,-3.35868895962526,-5.826429539223068 +1,3.4355813134213773,9.14268041217317 +2,2.166199737114292,9.276004069100377 +3,-4.806514075257109,-6.176686810954834 +4,-9.822523562636738,-5.869481622132916 +5,-4.621139272574359,-8.060855434412078 +6,4.4747219357668,8.273629260511495 +7,-9.728530116227626,-8.220807598229582 +8,-2.6866283098463537,-5.020025295814362 +9,1.378767171150755,9.602659656493188 +10,3.867626479978728,10.522258109676596 +11,-10.019643967118993,-8.561204961208864 +12,3.1671710020026347,9.097971021194063 +13,-2.2081924833320143,-5.444843455012178 +14,0.23912793474722327,8.427876901433995 +15,-2.987759541591694,-7.08595589387631 +16,-4.1840159787834565,-5.0702149083565455 +17,-8.486533819214419,-8.667510627344571 +18,-4.494290823461447,-7.150450890392653 +19,1.2558119967323405,8.197899103301312 +20,2.1818213511607967,8.746432335591942 +21,1.3200760815286126,7.41796544553869 +22,1.8904972897281256,8.57257643043655 +23,-3.7193485363950143,-5.240623044135857 +24,-9.457513160532276,-6.671618580818271 +25,-5.231562221340827,-6.12900179615051 
+26,-8.986651931095025,-6.837717212762383 +27,-8.207662597018455,-9.536569360167618 +28,-2.049579670066906,-6.472921400710251 +29,-9.639524600833964,-7.788209927657737 +30,-8.776334793089557,-8.156582541009008 +31,2.737094237923163,8.313448211969183 +32,-4.107258913773014,-7.253203528297377 +33,-7.614650904540343,-7.4661944917992304 +34,-6.427204478513291,-7.869472168097781 +35,1.2433061686957236,9.401338437774866 +36,-8.345174343300046,-7.4416437028036295 +37,2.547516417807357,7.45999636660081 +38,-2.134891928744789,-6.4757578017081014 +39,-9.259617221578104,-7.433731876292357 +40,2.4602912507737704,8.33101477113538 +41,-7.11571685833997,-7.048124365407645 +42,1.9126215443602343,8.681073040147224 +43,-3.9819779238702417,-6.99549817585417 +44,2.146158542856288,7.524796048577623 +45,0.5047319904292489,9.707045212565282 +46,-4.79657171125051,-5.501610863813567 +47,-2.46860991668539,-5.641766825613358 +48,0.6817225428391449,8.578205706512971 +49,-8.100503530554903,-7.807441348293653 +50,-8.650108390576605,-7.182412438427437 +51,-2.9958916237601296,-7.2583161696348695 +52,-3.941428858694723,-5.603165011366886 +53,-6.800720306446685,-8.310034132246175 +54,3.0545534181864817,9.084076988928487 +55,2.0408368304385878,10.36767129196232 +56,-5.485417828951816,-5.240275388904108 +57,3.5000948151612956,6.705805820263015 +58,-7.139992840170567,-9.637652647493104 +59,0.534345563443934,7.106713075523121 +60,-9.487190323558064,-8.643608714628892 +61,-10.946503509367886,-6.564003587764136 +62,-9.746889323072498,-8.275629851067816 +63,-10.751762571354863,-8.650720315364532 +64,-8.181450507546291,-6.911589426422167 +65,-9.963366436417864,-6.471998383848797 +66,1.298658872512382,8.687648446221763 +67,-3.350451117732089,-7.641549588114813 +68,-3.335282658114801,-5.598879748725009 +69,2.4556217150368376,8.58131342744841 +70,-9.2643768456267,-7.785201070272984 +71,2.899391735480882,7.911056585495816 +72,-7.942368048664481,-6.5356540345559635 +73,-2.651999504735985,-6.722436935826184 
+74,1.7357972633743375,8.854520143412076 +75,-8.82523623566712,-7.443574374185343 +76,1.5658105126410122,8.897134555679377 +77,1.0846406534215636,7.805574898153825 +78,-6.2598035016727085,-7.191736037479638 +79,-8.0339143487797,-10.148106920133172 +80,-5.3089974641738005,-5.60796719232985 +81,0.5136605842533299,10.77829733806304 +82,-10.820330406425374,-6.8756069145675855 +83,-3.51663431135422,-7.8431676677974655 +84,-6.004239004261249,-9.01394660786411 +85,-2.6056363159115494,-7.080218521917058 +86,1.2708644552463981,8.280037053421355 +87,2.5750902829464914,10.762453016509184 +88,-2.310851729093784,-8.213899267416405 +89,-7.739093835667072,-6.693967868173796 +90,-3.5167808443889865,-5.732994482494412 +91,-4.625341959670964,-4.96359984148821 +92,-8.008935739663128,-7.917812022983215 +93,-4.035125440587534,-6.154899480363435 +94,3.266735037731962,10.083283351526973 +95,-9.321868948210108,-5.880898687288509 +96,-7.619112400661499,-8.278278629099495 +97,-7.916933316243835,-8.028861671937905 +98,-2.8922942877836904,-8.467058564910289 +99,0.6427336948274962,7.494782339842734 +100,-2.591894090735525,-5.623507523030862 +101,-4.233901683179618,-7.669608911240129 +102,4.077130089568358,9.739324028463148 +103,-3.7865516046932344,-7.301585981430177 +104,1.9373966198229344,9.442135245411798 +105,2.351472667527091,10.964720984200605 +106,2.259612363924928,8.666073474582355 +107,-3.4241252242756253,-6.80999732837145 +108,-8.484768711332697,-7.2980027089825175 +109,-8.305629765600244,-8.941137471494109 +110,1.2529692312854797,8.604391379735322 +111,-11.35119724791917,-8.301555544464481 +112,2.462216783210357,7.139435610827717 +113,-8.882140125130165,-5.961494357428298 +114,0.946523796780079,8.181139910228348 +115,-2.4743126495981844,-7.727722592953043 +116,-9.039188822190084,-8.943038932209545 +117,-4.306513385532167,-7.996291047500129 +118,1.7809907796220739,7.885617616039458 +119,-5.052854268602579,-6.436944776620838 +120,1.4857201203970574,10.11517675982338 
+121,1.6593339300096615,6.538529282746874 +122,-3.301320021315037,-4.892692836144352 +123,2.2952744146262902,9.53089131246139 +124,-3.633765823556005,-6.310258913148314 +125,-7.461306205677059,-9.183803206577624 +126,2.1180913009139815,9.822597368237922 +127,-10.482056661389045,-6.830206837534433 +128,2.6557271763292967,10.012672354815626 +129,-3.6887394754006624,-6.681448861129166 +130,2.0526122706345244,10.126031120554 +131,-9.841517196358051,-7.642792994803499 +132,-0.1909503321475008,8.836758159529273 +133,-8.20962763862538,-8.663168824531486 +134,-3.082675678680314,-6.264927899309817 +135,1.7012279320561425,9.349539782977981 +136,-8.473941725240367,-6.192204136700906 +137,2.503751749735985,7.062362532878443 +138,-3.220248568089246,-8.12030731635864 +139,1.9310710128544837,10.019162630939851 +140,-9.433299379879964,-7.916117032617272 +141,-9.398968398973636,-9.408774226495124 +142,1.274811236642502,7.670054827039406 +143,-8.686273070305473,-8.148470148564801 +144,3.0444060537617714,10.36894670129741 +145,-8.787124666481784,-7.019780149417274 +146,-5.2403238395313725,-5.518098459118206 +147,-8.547542232490287,-7.5646976527021765 +148,-1.294372186921871,-5.486620013420126 +149,-9.931709264181752,-7.707463995343656 +150,1.8221329951589387,8.909619351308766 +151,-5.295664816085174,-7.08816085358701 +152,1.6944517864829436,8.622139688485015 +153,-9.849324736937627,-7.085781173605372 +154,-2.855985363889836,-6.579883060827188 +155,2.352132228282092,10.730356750386534 +156,1.772757926613877,7.900294244917435 +157,-3.270863798908827,-5.019187468844261 +158,1.528900578257351,9.246368843572169 +159,2.2320251542494427,11.008424116757629 +160,-10.175365813198393,-5.788379745910216 +161,0.5010976081720266,10.245478426775488 +162,-7.32001007404374,-6.419661704988274 +163,-4.721250239238775,-6.912615087252899 +164,-4.101986290415056,-7.334289906839879 +165,1.7123704438732543,9.443500763688121 +166,-9.86489135797329,-8.548428587904317 +167,-9.879669671065848,-6.944854821066275 
+168,2.4445212219454864,9.559402984365923 +169,-3.852292461664126,-8.2681335481681 +170,0.8967997675511092,8.949548310125508 +171,-2.721995139277326,-6.09361457876869 +172,0.6438945543625096,8.386403782397283 +173,-7.9521654231971555,-8.392585667208966 +174,-3.9798384775388995,-7.7644599483475005 +175,3.054869087466681,7.650740654624071 +176,-8.917177539147152,-6.788692998439901 +177,-4.199315749342823,-6.649741955139275 +178,1.8446071377761974,8.542879630314024 +179,-10.370865833658097,-7.678558090112394 +180,-8.696610248438073,-7.80683884438522 +181,-4.591504741070676,-7.062977197077918 +182,-8.053309769515488,-7.328243543115297 +183,-4.3393288903737925,-5.359712477759928 +184,-2.6995807753878003,-6.716037188715158 +185,1.5722319880345381,8.81760158278003 +186,-5.197935388304108,-6.586627943659147 +187,1.4161911903308217,10.304612024805262 +188,-9.847186133153695,-7.6066732601412665 +189,-6.231331539324444,-7.292814833581683 +190,-3.4376902697810734,-5.6248900702480285 +191,-9.742391009540542,-5.493005459363653 +192,3.1544127997119564,8.969868118655281 +193,-3.3628339167225594,-6.5962869153545505 +194,0.4407989308348059,9.3750109172608 +195,-8.045589824652708,-7.336897559864184 +196,2.192133258753018,7.521017403589007 +197,-8.028420047145206,-7.12446546477376 +198,1.356756615707675,9.12045201318182 +199,1.7822482709057093,8.345210645186462 +200,-10.291191739926187,-6.099503854703206 +201,2.1553141492470718,10.304125715217145 +202,-3.891268021938617,-6.330393265614692 +203,-8.142559515236046,-7.651378613926669 +204,-5.414873889952844,-5.667608738502075 +205,-3.90285132381985,-7.021129722177164 +206,-7.839166310122564,-6.713653310984322 +207,-2.000718895530837,-4.8561327476159235 +208,-5.053225707353607,-4.665893734982673 +209,2.5280984151345662,10.282375528861602 +210,2.178031020209426,9.03074701038035 +211,2.2896565607400836,6.956175652238886 +212,-9.909257398886236,-8.025783287508078 +213,-2.7373874721053593,-8.129760447528312 
+214,-9.470339427494082,-8.389456711580193 +215,-3.9965672509624453,-6.495812862881321 +216,-8.395219635108448,-7.834675563116395 +217,-4.941986125961326,-6.834141775734095 +218,2.2327108302998697,8.41414362862347 +219,2.287295859451085,8.887644780143772 +220,-7.845944624320129,-7.190703571009132 +221,1.5401686097423166,9.790724893103178 +222,-7.67880957172732,-8.133117602959778 +223,2.1861841301846354,8.78196937599449 +224,-4.238143352008502,-7.671738818280288 +225,-3.0965498171287154,-6.071726245923383 +226,-11.004835607039393,-8.716848062693376 +227,-9.175832646629356,-7.253346900483777 +228,2.1286414655151087,9.436285021740265 +229,-3.2460956238234475,-6.1200744731642 +230,1.6122318915319978,8.303560835915217 +231,-8.297627223448073,-8.98344888551434 +232,3.443593743589413,8.376205894418318 +233,-8.972835888261347,-7.907032371222781 +234,-4.819037441441789,-8.31956758929415 +235,2.7363876771241573,7.9143227643855 +236,-8.732282408784988,-8.40806966947132 +237,-3.270978516264587,-6.548173676954751 +238,1.0110716985856187,8.395317366377705 +239,-3.567051433094067,-5.961931965707605 +240,-8.974193176246727,-6.206988063769661 +241,-9.043580853434083,-7.578510466567455 +242,-9.825069377691243,-7.571454147497201 +243,-9.959202095503522,-7.757246268235972 +244,1.3112402040228384,8.708110207295846 +245,2.344417307917217,9.369574003359213 +246,2.1960978718119257,8.24373728738131 +247,-3.2994435945185496,-7.5167229946344705 +248,-2.9312612310190773,-5.86448053055016 +249,-5.6669205411466494,-7.73339528829164 +250,-10.773118997012762,-6.988895450707375 +251,0.44480349376541417,6.9062779746970735 +252,-4.829257856989387,-7.204356334213447 +253,-8.677214638678043,-8.911973906763063 +254,-4.341732055023909,-7.2440079453847215 +255,1.4325652748070907,8.989269695787607 +256,-10.234658135953211,-6.145124733151453 +257,3.8551057478274746,8.970969603145495 +258,1.5877285768403273,8.231664001979325 +259,-2.9810915170909293,-6.522368740685841 
+260,1.1361107128315253,9.256875908157047 +261,-4.463584274422763,-7.337956883380673 +262,-2.221673494768367,-6.61700999368102 +263,-2.5272851092134845,-7.245398720606795 +264,-4.626595662051341,-8.111892439896705 +265,-2.4010013889885258,-6.60762504208151 +266,-10.453008187368908,-9.277485875834412 +267,0.9807962426766026,8.525032007539878 +268,-5.529838919610108,-6.46897247791777 +269,-2.7850067892399637,-7.79461325635951 +270,-9.016728365455332,-6.736368845268928 +271,0.19613786668944466,9.145783871679404 +272,2.6158780164924567,10.526752836282698 +273,0.8735491475411368,8.748009691123961 +274,-8.521798633270743,-8.365648072531284 +275,-0.20211009113790102,9.45960548467821 +276,2.094411277506846,8.581819267882855 +277,-4.84706997923652,-5.438319811144145 +278,1.533227470384584,8.891516033673929 +279,-9.345243191550866,-8.953088415079424 +280,2.1968837683848186,9.357393775028457 +281,1.9676360256255412,7.905775199259102 +282,2.7666406810494615,8.893172489221879 +283,1.1123331957549405,7.089192550064636 +284,-9.019325251132761,-7.907631659489907 +285,-2.893192161876666,-6.814903945239156 +286,-3.0269458041799973,-6.059131577331172 +287,2.570401696944053,8.305842017624176 +288,1.5655854042904174,8.74156655418919 +289,1.0628175690130237,9.762262480977503 +290,-10.840015823233166,-8.682333870606595 +291,-4.373218793921528,-7.171230408886504 +292,-3.2000459048433614,-8.783724031606651 +293,0.856161136022519,8.920792062660606 +294,1.1841844787645357,9.237269039840463 +295,-7.439762544783075,-7.479858626024707 +296,-4.227927571641294,-6.936506216069389 +297,-3.2625108713404107,-6.4710368562105485 +298,-2.5931960537310177,-6.818546536603943 +299,-7.710660587588098,-7.139909842153512 +300,-2.6538339208759245,-6.528345062073545 +301,-3.645592242498573,-5.712865136712878 +302,1.7976882615408294,7.681012266921148 +303,1.8957060092820301,8.948752372953674 +304,-10.209732995673647,-7.379583885916444 +305,2.4898870588344018,9.67522366010307 
+306,-4.656201688465136,-6.4040923032719395 +307,-4.178020157153271,-4.263163447068377 +308,2.947568746150873,10.41016287956812 +309,-10.311337650813531,-7.7016387455981565 +310,2.899643832061082,9.553846364823723 +311,3.4031947099000304,6.793837809132251 +312,0.20789685988955853,8.703813212813328 +313,-10.342936071131309,-7.335644866798318 +314,-3.837916986133941,-5.64697648617353 +315,-8.946377468568006,-7.6440966476104535 +316,2.2023859840552835,7.65246528379812 +317,-7.944711603608673,-8.52101235152373 +318,-6.887057552825117,-8.421060347430048 +319,-9.144718100460349,-8.138501182562196 +320,-8.69398046588213,-6.733942468119798 +321,-8.605288738024067,-5.8113180142678615 +322,-3.003901781953935,-6.678674367936518 +323,-4.069352394765035,-5.498412068615528 +324,1.3862411272877035,8.051730244382817 +325,0.649524922637946,8.195130900829021 +326,-5.551270778837715,-5.688935361303699 +327,-7.365694900240971,-6.710256401860435 +328,-3.801798115837755,-5.421961104436759 +329,2.0087239251474527,9.132767223361697 +330,-2.7804797280226135,-6.567233465468077 +331,2.782014961246936,10.009510594547592 +332,-2.3492644502138282,-7.350733351497415 +333,-4.206831963234683,-3.4421170183703773 +334,-9.604973346130063,-7.731200202743458 +335,-5.004073768118336,-6.70213921936662 +336,-1.8628613842403539,-6.609383400201124 +337,-3.952870764311914,-6.525559378756727 +338,-0.08365741281414563,8.927430260113754 +339,1.7542838569691432,8.24173217837714 +340,-3.7262691189584594,-7.652280786853464 +341,1.5580318808749487,8.603542840151066 +342,-2.371466118704887,-7.187780307497872 +343,0.8290263218300455,8.252906060095047 +344,-10.432328322940055,-9.203523570365311 +345,-4.259914492517776,-6.565955647408584 +346,-10.774445675551492,-8.899759099508032 +347,-7.809371437533612,-5.966188204402244 +348,-2.6606718377353547,-8.53042766347821 +349,-8.287999454561152,-7.9341340655673305 +350,2.86466120959558,8.883999015070891 +351,-3.102741786258152,-6.214305342528414 
+352,1.7402868823328423,8.066138050333372 +353,-8.058843812732409,-6.082200097858894 +354,-10.538382508045645,-9.15746793794495 +355,-9.713297716658428,-7.184332169664756 +356,-10.437879087402509,-5.268573955836219 +357,-2.734892534740736,-5.98875513800358 +358,0.5226987795344917,7.294056764420545 +359,-3.0607384898678216,-7.7585968388597655 +360,1.6936963900367203,9.665414464515292 +361,0.9202265492009503,9.447548712887738 +362,1.3890555748632991,6.0066285825575925 +363,-8.874206745756213,-8.280187276924956 +364,0.5136443196089762,8.817861363688756 +365,-4.775571604113167,-7.198194850532794 +366,-3.341858478491677,-8.101998132270335 +367,-7.99367565381126,-8.882547523354486 +368,1.9337420700556978,10.01666620357398 +369,-10.395557819411785,-7.508024562561236 +370,-8.55702189527164,-5.68806053209533 +371,-5.13844879794808,-6.223853336672088 +372,-8.197201783076405,-9.215423475535736 +373,-8.278897263977411,-8.55048452631327 +374,-8.66197140398328,-8.248117237229634 +375,-3.006743479262847,-5.93423353306401 +376,-8.267782020725011,-7.778194164911808 +377,-5.083553805875464,-6.016886081312889 +378,-3.9407723920511106,-6.464315298228791 +379,-7.083564965305358,-8.615057127161643 +380,-3.90779724587636,-6.604006945277915 +381,-10.7012857676038,-7.568656886403618 +382,-4.116603353431634,-6.150041930350558 +383,-8.278883798618557,-6.804438950715207 +384,-2.827373858118777,-4.551046282794583 +385,2.158307636228448,7.145022958514378 +386,-8.137932040124426,-7.806895797410348 +387,-11.131194018210197,-8.413216117496031 +388,-5.788425975700523,-7.953896326150473 +389,-4.620097654178233,-6.310484872527448 +390,-8.478880483642358,-9.204281358114084 +391,3.504208761427533,9.36879171541479 +392,-2.926987256481567,-5.491045552832883 +393,-9.476155252156401,-7.701364445205184 +394,2.229614102651407,7.71008891054642 +395,-8.30997775631249,-6.911040660872131 +396,-4.71974410585834,-6.38576628604139 +397,-8.056280287222378,-9.280595305783406 
+398,-3.4225775155743694,-7.878439916434665 +399,-2.0853909321871917,-6.879355180933256 +400,-9.319525183116145,-6.8769547083138844 +401,-8.245983352751873,-8.321261728811903 +402,-3.3714298668646565,-5.950275017858789 +403,-9.00988343723979,-9.298496814665032 +404,-4.10592381024369,-5.918628469975301 +405,2.5936068911339762,8.76714571096227 +406,1.715426613235632,8.545618375337266 +407,1.8386101193460516,11.151159191103638 +408,3.2441170619894777,6.416826938113145 +409,1.191626983310161,10.46818371366243 +410,1.6430030126260013,7.929862784787724 +411,-3.038543958519568,-7.463916995199537 +412,1.3201160157782514,7.564886953060057 +413,2.6514050368865023,9.864242492056405 +414,-7.969901511981649,-6.534473589445678 +415,-8.518470244643877,-6.898948871818382 +416,1.7977746647612987,8.468637650035655 +417,-8.376124454197242,-9.028969361919913 +418,-9.122897845891714,-8.239785318173022 +419,-5.289802430217316,-5.992438020441808 +420,-3.635438082896571,-5.730273861471589 +421,-9.77253536308097,-8.261125493159623 +422,-3.4221666763669836,-7.268500499835926 +423,-9.611020882151013,-8.769076789301296 +424,-11.325229986370699,-5.705853262440593 +425,-3.365929740057175,-5.644627040063918 +426,-4.900398230837069,-7.061212971349762 +427,-10.171859893388046,-8.275274941644462 +428,-4.575123981001321,-6.452261305928669 +429,-8.135563136348908,-7.716422477177896 +430,3.7322331465984124,5.422108729599827 +431,-9.29526213846877,-7.626578669444718 +432,-4.237566821627697,-7.55005737896009 +433,-9.183361948844631,-7.570684255292109 +434,2.7097980229426373,8.43958518362943 +435,-9.192923549857293,-7.53471946498263 +436,-9.67752430211944,-7.410216574460519 +437,-7.967875902628163,-7.559640794070247 +438,-3.3319935812854173,-8.396814624265595 +439,-3.5940064636472444,-5.374015552057795 +440,-1.5492094557452,-7.17812214097376 +441,3.2600412739553857,8.651218199782074 +442,-10.184282349192125,-6.604160884644846 +443,-3.93421035676419,-8.314076821804923 
+444,-1.7680480218377566,-6.7506327668413215 +445,-8.407416462813392,-6.185746288386427 +446,-3.332866268516075,-7.911213684246127 +447,-8.547386855384353,-6.619893671873742 +448,0.6418789264448461,7.268533137555316 +449,-3.736679259883213,-7.429561167168869 +450,-2.4183282579016465,-5.5442011571539 +451,-3.7143354613884396,-4.9109581342459 +452,-8.653352329841526,-6.758439737388211 +453,-4.817456185387494,-6.298423954607866 +454,-9.413912300534099,-7.663639080709745 +455,-4.793771934490004,-7.060045755486593 +456,-7.941843972547384,-9.594954224784123 +457,-3.902419674554519,-4.631760045687949 +458,-3.9825829521920357,-6.364153621508403 +459,0.8621510490630231,7.668541143487601 +460,-6.650141102609025,-8.218777499609105 +461,-3.8472827531400733,-6.597442445907043 +462,-3.569322199391855,-5.9645134407216664 +463,-8.146286308000464,-6.128937296910369 +464,-9.750535050249626,-8.51696139122317 +465,1.7159090210086831,8.369375333648767 +466,-11.121195487486151,-8.788963931131986 +467,-4.074627342273818,-5.891460062736109 +468,-3.0347257670849856,-6.0028152286746925 +469,-9.186386987296094,-7.920976668130999 +470,-6.7905433465090255,-6.3612687635612355 +471,-5.554880340398808,-6.636631911492183 +472,-3.4352895448772296,-5.731896371561744 +473,-10.838899284430546,-7.152570323659559 +474,-8.54860352122666,-6.5216233759320685 +475,3.161878344164532,9.347916489345463 +476,2.9666892258902706,9.643799186052602 +477,0.23182081567599044,9.114102444020164 +478,-8.055967894993012,-7.482813770441974 +479,0.36569616829843854,8.39705531037486 +480,-8.314137471818114,-8.158984139838683 +481,-3.9219865911561036,-6.024563898363107 +482,-1.8470686606663926,-6.064940124304251 +483,-10.010005981707781,-5.847878391023799 +484,2.0841261866243816,9.752415577568359 +485,1.2196490622390979,9.044615878836572 +486,1.5556686737158538,7.4190159791173365 +487,-5.169309568731435,-6.464305198493605 +488,0.8678704288717831,7.624208081599757 +489,-2.8456583331275023,-8.04100291426108 
+490,-2.8564600285075823,-7.1373502752514515 +491,0.8086140671702274,8.262778160520003 +492,-3.625017072198839,-7.548857262168908 +493,-5.204686260531217,-4.689976310520366 +494,-8.40229914634969,-5.660631753126961 +495,-9.307312585572348,-7.153677154915194 +496,2.5529579725720186,8.299066074733663 +497,-4.41328800219494,-8.01017206969573 +498,-2.7960777128921808,-6.852967888455779 +499,-2.5208586601654543,-6.504870551634729 +500,-4.691768444229939,-7.613347696500981 +501,-3.2757809197875303,-5.590685062433759 +502,-4.250152895681304,-7.285967822690733 +503,-9.204293263845981,-9.084210002503024 +504,-3.0196361833879566,-7.004943430493599 +505,-8.562991571522256,-7.171363619659097 +506,2.7529256487972065,7.488883167233098 +507,1.830505520850492,8.892999330798872 +508,-8.161948836267698,-9.532139285875404 +509,2.7431477646713143,7.973300601077908 +510,-3.5069453753898236,-6.23879032061062 +511,-8.317256487913806,-6.618204024712531 +512,2.300759448648913,9.528896374784418 +513,-2.8552241932608386,-7.267866191046745 +514,-2.7046388629725895,-7.873394665205723 +515,1.0449359642174199,9.346094109846385 +516,-3.0592566171782027,-7.785107890662386 +517,-9.439183598162222,-8.093294870053596 +518,-3.68735504474474,-7.225645765490254 +519,-3.5596360828045785,-7.18260924418626 +520,-2.973973904340412,-5.705324949803886 +521,1.7716757659658577,8.761365543878778 +522,-9.390697751969299,-9.450181516627964 +523,1.6123347926763898,9.775468320939863 +524,1.8646981608070459,7.93117601131186 +525,-9.596169716821008,-8.178547610350808 +526,-8.541383684734354,-7.842370307620071 +527,1.1131363473263765,8.825753133445415 +528,-10.244261040786068,-8.081342842374916 +529,-8.579807002681788,-7.135852360203103 +530,1.763920120447527,9.845449563376503 +531,-5.51546353846694,-6.943804429871649 +532,-9.14405296164511,-8.195695340516782 +533,-9.035974710668595,-7.6202987380717735 +534,-3.2615633531068053,-6.701705025156346 +535,1.709937162564638,7.14722221654483 
+536,-2.245666954425036,-6.883861614887863 +537,2.3677130438104808,9.106566897279938 +538,-9.458167693002014,-8.503042821677347 +539,-4.076697391592253,-8.019789647543242 +540,-9.036595223062285,-8.287709453993642 +541,0.6758589143112543,7.8500170460493335 +542,-3.75132705779662,-7.216694471611288 +543,-9.272126552693619,-8.548950171460847 +544,-9.554854832314836,-8.917633282884536 +545,-3.0596964259952006,-5.2968119716487685 +546,-9.802026620811418,-8.074413213875708 +547,2.2378712511043544,8.327043087398575 +548,-9.117971064828826,-5.667870797754498 +549,-8.569344288306812,-8.101000886809317 +550,0.9240764673458738,9.528352749876202 +551,-9.82004919711966,-5.70373762491799 +552,-7.630307522131641,-6.017151383129318 +553,-2.9524525381669573,-7.570534768535014 +554,-3.4125128861756826,-8.645715848727079 +555,-10.625270472104175,-8.931403459118163 +556,-4.694057475120292,-7.911445423833883 +557,-2.25619832583494,-6.65809120171658 +558,-9.2780474833441,-5.696778701875973 +559,-3.6784846740185855,-6.082444716669731 +560,-3.4293429064006276,-5.847821903492798 +561,1.9439709651388997,8.648073360413566 +562,1.4819223967537078,8.035868707853556 +563,-3.7699715009405357,-5.292806218828125 +564,-4.877792159215375,-6.009914407133835 +565,-8.803381135351215,-8.33315553434123 +566,1.0284225741506225,8.750446127211628 +567,-4.105377571728074,-7.401860844543055 +568,-9.685248150269778,-6.149500712625501 +569,-8.724619020013902,-6.343281306650992 +570,1.2889792874675887,9.189494923898836 +571,-3.7788197836319144,-6.833095678512737 +572,-9.522780233052375,-8.018520939310838 +573,0.017303362231853914,11.12668688502151 +574,-7.870209813144006,-7.871340705112821 +575,1.5463790222165257,5.981627896367072 +576,-9.50850094688113,-7.4139405643272625 +577,1.9962100293771334,8.70032989112369 +578,-3.6308999273375187,-7.1807564718528285 +579,-3.0599072191610253,-5.156944837721683 +580,-4.015838971377803,-5.628626943136464 +581,-4.3059399615101945,-7.0187684115528794 
+582,-7.69025302026296,-7.981267867885812 +583,-9.25381132352528,-7.327874304532358 +584,-3.681895830814411,-6.992466985298881 +585,-9.93429116381224,-7.552250100807363 +586,-2.904660034160202,-7.175008970106031 +587,1.141690863237407,8.264628154046255 +588,-8.021619535904726,-7.629673175480269 +589,-9.634212614246236,-8.249125800630479 +590,2.102157219741975,8.860344265071555 +591,-7.817941391703549,-8.641911617703295 +592,-10.291511817990084,-7.748716057362189 +593,-2.355180558375901,-5.002047560018639 +594,-4.122747380585646,-6.653506344669552 +595,-4.258002300759403,-8.079546417995866 +596,2.894269851113211,6.4558135431821775 +597,1.4200834497833223,9.27022793492442 +598,1.1775973337547503,8.944446025590066 +599,-3.6292799609187383,-6.561517231896472 diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_3.csv b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_3.csv new file mode 100755 index 0000000000..ffd208447d --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_3.csv @@ -0,0 +1,601 @@ +,0,1 +0,1.2255376526898032,9.141479217773881 +1,-4.297337043474307,-5.530916545335797 +2,-3.6087115377420087,-6.516947027948974 +3,1.7283245822395865,7.644297041081181 +4,2.394681190268282,9.679520106012909 +5,-8.540320716098458,-7.074720415703683 +6,1.5467345165080124,10.016750892066021 +7,1.116032103290741,7.389600128038769 +8,-8.564960747610074,-8.004840738257776 +9,-4.30699470103154,-7.210222308396672 +10,-8.324455580440977,-7.193745420077761 +11,-3.055746213116298,-5.738151226380286 +12,1.564107524552199,10.212558601072592 +13,0.3278426725521013,7.717067491079936 +14,-8.98260775540197,-8.445947915664142 +15,-8.148884610857909,-8.249832172954447 +16,-8.616360734466356,-7.991086807340608 +17,-4.995125679553251,-4.805418699792586 +18,-10.043205855988308,-8.38198798465999 +19,-5.117321263158294,-6.856273768969385 
+20,1.376353630018705,9.192645073591452 +21,-4.251693900241542,-7.526106395251568 +22,-2.93596205716008,-7.236746464333438 +23,-8.808958437261914,-8.455460559549115 +24,-9.847650729901957,-7.851876184289234 +25,-11.048039228950888,-6.60393674080736 +26,-3.964409490913115,-5.299418515912615 +27,-7.0493565263673155,-8.740114248978552 +28,2.263311060790778,9.849075294679976 +29,-3.8024901458716647,-7.117190913344736 +30,-4.800964446022614,-7.504211423063539 +31,-8.870192169247716,-8.672027956256194 +32,-3.8386174296796276,-8.499766712668153 +33,2.4596144288951614,9.826078306198113 +34,-3.3803609919209965,-6.935355354171216 +35,-8.600888421369499,-8.081225402937271 +36,1.6990780728593955,8.026541885792696 +37,2.0234198603467184,8.078938358225274 +38,-7.861223420539712,-7.970030131717135 +39,-8.97420790996449,-8.139796716130322 +40,-4.274138163785766,-7.218007933270192 +41,-8.853199100620417,-8.693668385815021 +42,0.6026265514398386,10.98343483713762 +43,2.433240364895425,9.059774694791999 +44,-9.721084263354559,-6.5643821056343405 +45,-10.084805290383525,-6.870937627701416 +46,4.0936817036834015,8.872937261702587 +47,1.6929446489697524,8.304053977767918 +48,-2.8273918317995848,-5.174707032406225 +49,-3.6377369973212033,-5.981861133049716 +50,-9.127569607956952,-6.626326837736638 +51,-10.248572831708822,-7.548645388130816 +52,-7.855830617376127,-7.904014064483153 +53,-3.445179956310673,-6.464140555832437 +54,-2.6452978926620245,-6.439318643401205 +55,-6.39087632984327,-6.928394253403045 +56,3.5188547547156386,7.551672632060567 +57,-7.855783363668796,-8.094237123526709 +58,2.615776219644124,8.843205260668764 +59,-5.601959336375614,-7.439435156164771 +60,-8.87656188186118,-9.647791398501298 +61,-8.987652698755243,-8.313103579761762 +62,2.7378795714787665,8.846531747222823 +63,-7.108610717006393,-9.030089038622796 +64,-1.8963294779383923,-8.189264029877402 +65,4.071599487948465,7.716846493830238 +66,-3.6658654047687795,-7.47426414118374 
+67,-4.035359477565178,-7.487029436292918 +68,-8.810170608211632,-8.379134703981288 +69,-8.20797702039906,-9.09686600955095 +70,-5.08018104907724,-7.2637114806271015 +71,0.6351331740071233,8.224606419467982 +72,-4.47822711506591,-5.352695184484038 +73,2.358168656987021,8.308710197655335 +74,1.004613507660405,9.410385162767355 +75,-9.361705481884158,-7.184962903676741 +76,-4.282438839606894,-5.53582099677129 +77,-4.1049753854859,-6.322210169062712 +78,-7.546042903965684,-8.210753335220534 +79,2.84173196981702,8.716906136750826 +80,-3.7536901854246327,-6.72560061365912 +81,-10.667769318904304,-7.643966842339937 +82,0.8150704800004155,7.6566085308868645 +83,-8.356933989588283,-8.589906034300064 +84,-8.801307660780632,-7.621502234241935 +85,-4.67280437236893,-7.0337426395333065 +86,-3.495159526615141,-7.7823625361508295 +87,2.0599669727110306,10.824626023348948 +88,-1.761515057182471,-8.248027717315765 +89,-9.092492983616513,-8.237898770994937 +90,-3.305040516751636,-7.26504640263574 +91,2.1227809787420706,9.194304511259828 +92,-3.145931808820589,-6.229095802364749 +93,2.889743289938168,8.929504976806188 +94,2.018863059789951,8.220482210983691 +95,-3.2196229174127033,-7.532972081961835 +96,-9.180006612215056,-7.060068395688889 +97,-7.986523717432862,-8.772168767696355 +98,-9.168083705283559,-6.822850835906491 +99,1.289775580475062,9.70271783585797 +100,1.2740322978213001,8.292177837742086 +101,-8.916527211291001,-7.656106436204208 +102,-9.700898376071635,-8.032415703940435 +103,0.9669161706345597,9.634344618512687 +104,-9.67270102358252,-6.908277651580308 +105,-9.564463430382272,-9.710466191751374 +106,2.2969425258369314,8.987649262904931 +107,-7.762023185100588,-6.743135723883691 +108,-9.255459961544892,-7.782531060262364 +109,-4.273137189588981,-6.127968184795229 +110,-3.7508291438510564,-7.1796650264303326 +111,-8.648220880324805,-7.172900439017217 +112,-8.658333970395889,-8.56713734626757 +113,-5.733518920545488,-5.010252542754743 
+114,-2.3741665613138037,-6.357738603600945 +115,-1.9856093324216508,-7.617195339401777 +116,-6.704566032535892,-8.238334916994258 +117,-7.914571748360948,-8.921005996340748 +118,-7.58494093314669,-6.710379996663295 +119,-4.006968138276497,-7.798415227381543 +120,-10.00761330753919,-9.242893434719528 +121,-1.7128538688814134,-8.601998179774073 +122,1.2167337031493959,9.076757154496596 +123,1.6447608096299158,8.483163215726453 +124,-7.218522355872036,-6.5669042952039565 +125,-8.51184088492603,-6.910923174986802 +126,-3.1001125221811203,-5.295113344522024 +127,-8.73963648128209,-7.186510393418622 +128,-9.09764611141698,-7.534170213328472 +129,-9.780761540280578,-7.742981207657383 +130,-9.717247650860006,-8.77608768979335 +131,0.3003464179203046,8.95041049956845 +132,-3.5149896816997006,-7.909518081993374 +133,-7.963579115398975,-11.103666868682154 +134,-2.6241211231232815,-7.423013872519332 +135,0.8682862034790112,6.244997188436713 +136,-9.043625473149135,-6.909556510217293 +137,-8.730545963953597,-6.167287236441934 +138,2.836438826504372,7.731977744231563 +139,-10.136257512743747,-8.061638998745739 +140,2.5255796786313267,9.989763742863861 +141,-7.728567526641721,-8.516688228325041 +142,0.10498352981338144,6.936041644039058 +143,-9.878483086701534,-6.825949923646933 +144,2.1911668619430684,8.681924891111366 +145,-9.297423063665612,-7.006348028486722 +146,-7.399923203686196,-11.741207542012472 +147,-9.851057134480019,-7.664685319105955 +148,-5.821319298892842,-6.531365455033449 +149,-9.088485188036124,-7.266914238231228 +150,-10.508127705680996,-7.2971293307035925 +151,-2.776680306767184,-6.040444822335491 +152,-8.397895528644842,-8.939148856577797 +153,-2.6224385814925615,-5.942589208384663 +154,0.7126631210496013,8.427374154516944 +155,0.9069792571976608,9.431652824626497 +156,-4.941144074728792,-6.913310181617657 +157,-8.388660616639175,-9.781674687896398 +158,-10.15866878410844,-6.086199496080454 +159,-5.280330752512068,-5.8711826568517775 
+160,-8.943632752777454,-9.021959929498285 +161,-4.2741222237074386,-7.468014406499503 +162,-8.111790708521093,-7.829244998012636 +163,0.034947129092518514,9.599563934157821 +164,-9.520977354855793,-8.34632175374682 +165,-7.157739814914419,-6.065387153893702 +166,-5.226995045988889,-6.111376787056115 +167,-8.22365310520065,-8.570490114550068 +168,-3.483702010774605,-8.258454646969218 +169,-4.675132818096483,-6.718383881062376 +170,0.8076322417226721,8.275565136333183 +171,-7.8602699422307944,-7.333797725105611 +172,1.979964867428537,9.865183123448809 +173,-7.947454000060463,-7.646911308269281 +174,-5.331960089565768,-6.341744172920323 +175,1.7885922368246647,8.340534722885964 +176,2.787746405882677,8.139148148558975 +177,-9.151914267743374,-7.569577432408877 +178,-9.129791842784071,-7.265252727817658 +179,-8.366843771779305,-6.496401516640613 +180,-7.1560198551097045,-7.7684193348064605 +181,-9.492772888554155,-7.319890377024424 +182,0.6053803088265532,8.075801111134975 +183,1.6434165607683415,10.271458125764944 +184,-2.739547260018664,-4.138099443059702 +185,-9.7037093490842,-8.790361694936095 +186,-2.7306182130527286,-7.208187200163216 +187,-8.134892576819396,-7.520714708050233 +188,-8.267745100216402,-8.385102540138817 +189,3.3415830473073695,8.734171325609722 +190,2.9311815000005996,9.143119401290065 +191,2.2860882129071807,7.846461236250382 +192,-3.6644673202519136,-6.438444904529135 +193,-7.523276825143395,-7.255807730935022 +194,1.868794551266115,9.76318232418679 +195,-3.296871001299072,-7.719880058593299 +196,1.5442319254883006,9.529188740813856 +197,2.3179624773871534,11.270953386109198 +198,-3.7262983945769776,-6.324564664256947 +199,-4.135269049518808,-5.070719500945035 +200,-3.9373005080229473,-5.689290940444298 +201,-7.205346385928806,-7.311384820544484 +202,-3.68907443689269,-9.752269221376734 +203,-4.685004081304692,-7.368110975680209 +204,-8.763720747411547,-9.290122994952846 +205,-8.872612765848631,-7.338108680713353 
+206,-3.3042196564355644,-6.641155245652315 +207,-4.123653143016889,-5.644872475075601 +208,-4.766993102823317,-8.156348503780505 +209,-3.5700447726245157,-5.8794728348385785 +210,-4.132984951847543,-8.068499111058316 +211,-6.622827538964524,-7.551504765031613 +212,3.2137070158351415,7.4433402642109625 +213,-9.77406291453185,-8.293643281900614 +214,1.146176894713229,8.681501484663256 +215,1.2391984568734045,8.568405824674697 +216,-3.540082501244764,-7.192348313166246 +217,-7.532863915593086,-9.511638351944974 +218,-8.250126393231623,-8.764419022184814 +219,-7.482088206544333,-7.577130499788636 +220,-4.212302917624514,-5.289820445745089 +221,-8.261546908364274,-7.772718494251158 +222,-4.544762612440516,-4.842522228687262 +223,1.1387722979236394,9.192847076109373 +224,1.3535929181464974,8.647364618306227 +225,-3.612878181498939,-8.699470128470956 +226,-3.148044440176802,-5.1170201801674455 +227,1.354926291903442,9.499117985770674 +228,-8.65829008811044,-6.307401366393167 +229,1.2316993801929832,8.222465907459812 +230,-8.331872488428887,-4.853396673925863 +231,-9.100328499986468,-8.573730838333006 +232,-3.107786079685531,-6.1718712105409255 +233,3.2652035789984177,8.128662224265318 +234,-10.552602581625532,-6.761345255592396 +235,-8.907554671210626,-8.577143103377015 +236,1.4855632154992677,7.874244483310497 +237,-6.027901651287577,-6.787145221246898 +238,0.5556693033797273,11.480362416240872 +239,2.098624352712322,9.814242943395692 +240,-9.887555426056213,-6.710976909450727 +241,0.8649055384285145,9.097109122257002 +242,-11.078196900981547,-7.984561032228856 +243,2.316337179364928,8.598986846679601 +244,1.2640011485194367,8.82879738781108 +245,3.0267554273654462,6.784623431027779 +246,3.4220828134636765,8.502220645797342 +247,-2.4494211129155765,-4.999817909826381 +248,1.0421080438586254,8.824880864245943 +249,-8.051030898506728,-7.744423230554592 +250,-10.116651692250501,-8.275211300531558 +251,-8.202749106955345,-6.811801129305094 
+252,-8.967894951789217,-8.509902072896494 +253,0.488437835066031,9.44417968077311 +254,-0.25003681933843014,8.865455583137448 +255,-9.585236818018899,-6.645739396359231 +256,0.9766413557554834,11.752064783270438 +257,-9.034868112790015,-7.374528147394214 +258,2.5822045654186043,9.102988870451187 +259,-2.0778811776069945,-5.786283877066672 +260,1.5787413115793543,9.044736891113242 +261,-2.33485326917092,-6.061955751331998 +262,-3.2895770824811423,-5.034392291746466 +263,-10.081565262534639,-7.374197032305851 +264,-3.994431608464796,-6.0162453845407935 +265,-4.232758796887437,-6.426632957640464 +266,-10.498691473054258,-6.403791006064546 +267,-2.292209986812295,-7.250831946240652 +268,-10.76508425595502,-7.165069717499894 +269,-8.386687189897946,-8.769216122642224 +270,-5.27510041598469,-7.945133906351278 +271,-2.334510993460131,-7.3989680996483465 +272,1.284797722990889,9.064382388513113 +273,-6.168646964600059,-6.8219172657965785 +274,-3.92701169853442,-6.704196762275968 +275,-2.6619749106971726,-6.704202485296508 +276,1.1478911738167819,7.512416897879306 +277,3.2510022512727446,7.3539410653768025 +278,-7.315315871979378,-5.88246686985582 +279,2.1117845497483563,11.076505278766518 +280,3.639056472297903,9.467060612350386 +281,-4.166374993413476,-6.431310496109624 +282,-8.726577750428884,-7.345021586443768 +283,1.6433220828690402,8.696077338068736 +284,1.8838280865620047,9.738013169059982 +285,-7.629361397843038,-7.55931022100682 +286,-7.912190771005928,-6.98790548068395 +287,-9.412741193816466,-8.791628726681072 +288,-9.74190411151045,-4.956844114033405 +289,-8.35965254228762,-7.669920828853046 +290,-2.801288589366356,-5.399769916668381 +291,0.2601192833910215,10.467333876493818 +292,-11.025210031015783,-8.136674217017307 +293,-0.06674706099380368,10.2899750307263 +294,1.067776664155953,8.750462573983851 +295,0.5509278352071858,10.273515136192767 +296,-9.884243025797673,-9.28367482506358 +297,-7.872243069788768,-7.757767318695286 
+298,0.8084074017233095,9.656234963082685 +299,0.7624874038465008,8.947917897098035 +300,-6.332133121004648,-6.785874331813723 +301,-4.429652056501128,-7.578014459631968 +302,-4.188584113595973,-4.934755810797555 +303,3.6791257547345637,8.31690127193822 +304,1.51966280310663,9.29788229347068 +305,3.2102902579616233,8.295945923007892 +306,-11.779303109403571,-7.6837457438194265 +307,-9.216883150382246,-7.163067385221396 +308,-4.249121038157665,-6.5463368597678535 +309,1.6312431011720003,8.94177175465414 +310,-9.06559350244003,-6.148320200131417 +311,-10.015959896487981,-7.709434635038423 +312,-7.173319331442961,-9.256992837135826 +313,-9.060920902092729,-7.303562314018536 +314,2.312700300664332,7.139243673442236 +315,1.5809938429950523,10.007422841453208 +316,-8.54445075456971,-8.41948970598796 +317,-8.965879693065672,-8.447110955650984 +318,-9.54302474414816,-7.420209367781994 +319,1.8744747391345913,7.97319136754416 +320,-3.9052002880555103,-8.238735799274453 +321,1.310281830658,8.228855547316034 +322,-8.64301324545217,-7.027923448109183 +323,2.805369886850809,8.842471053078325 +324,-8.649270035579416,-6.535989617502658 +325,-3.39049806833454,-8.950043708641422 +326,-4.684398595708872,-7.537100814540029 +327,-2.7612003380209966,-5.035406026117489 +328,-10.13877327966241,-8.114347510336723 +329,-6.8544350435639885,-10.03456862183058 +330,-3.4750669023257132,-6.82654679089782 +331,-9.633501263903536,-9.055562467014843 +332,0.9048609502183895,11.673092427259427 +333,-4.2996488273011675,-6.010824368610892 +334,-8.771337671565963,-7.995483617420683 +335,1.6839223227818167,9.237547972212987 +336,-2.671508715890748,-6.069009146147523 +337,-4.397948091647258,-6.587398168180368 +338,-7.139508129041156,-7.821464761648635 +339,-9.475778540263686,-9.243996319844927 +340,-9.959314284171004,-7.285005944638585 +341,-7.839385017571539,-7.0330905675258295 +342,-7.868893904638215,-8.353276539871395 +343,1.3211888119472888,8.412687232783561 +344,2.6800131248732386,8.32815336253785 
+345,-6.857701664404774,-6.452673448184665 +346,3.713139711261232,8.725335269085823 +347,-9.107446210080008,-7.921545298732012 +348,1.5893755906533062,9.98187640608331 +349,-10.159372029770498,-8.819216585856166 +350,1.4574743605281437,8.149245911727022 +351,-8.871582388403974,-8.559470119720018 +352,2.2751058610991945,9.901232247753237 +353,-2.7472542355419822,-6.124315203745871 +354,-2.656511821980372,-7.568181529571761 +355,3.0291019009630196,8.963009764693098 +356,-4.161637206516404,-6.81038562883428 +357,2.4730627290950755,9.091190792250169 +358,1.1200833219060131,7.833226207919465 +359,-9.070217965415747,-8.54140715137701 +360,1.0982393965124597,10.043973011835417 +361,-4.703066488892138,-5.423187161551743 +362,-3.357780102370316,-7.157938971939569 +363,-3.8242194656559905,-7.918767533053278 +364,1.1317031057022624,7.430132519964122 +365,-3.1464889947369326,-6.132635881498026 +366,-3.58162916465379,-6.68711506421579 +367,1.1996135191507407,9.006723457478385 +368,-9.45763051539227,-6.400190829645651 +369,-9.014623623797148,-6.574091480997232 +370,2.333568958298026,7.5107128468109 +371,-9.268111132437175,-7.69591073961429 +372,2.5920778736471224,6.5826104754205 +373,-1.225882906950281,-5.666669443257231 +374,-8.325853476844289,-7.793387810859259 +375,0.5055059451451487,8.966183931370773 +376,-4.978226962396014,-6.541022557172996 +377,-8.612561840486487,-5.163326887625027 +378,-9.271682378408538,-7.475451620888086 +379,-8.27293191600384,-6.776477142175483 +380,-9.006055108715797,-7.844080334639579 +381,-3.5485851032861455,-7.428944065491923 +382,-10.686134323492006,-9.763820650368935 +383,-3.0800097299068763,-8.15854472578761 +384,-4.255691548792045,-8.38292415298679 +385,-3.307254052685989,-7.470134723775772 +386,-9.321496507840077,-6.842675169149333 +387,-4.7321768780160465,-6.822753330631313 +388,2.230652092511753,9.209909160266504 +389,-10.199387433348697,-7.21773120699181 +390,-3.4127519199251655,-5.185028443964861 
+391,-3.8416188751016542,-7.533373329866099 +392,2.1132609712972874,9.020452298425571 +393,-0.5325780055907829,8.484331396616149 +394,1.8445158417826104,8.81605756196587 +395,2.3584336367199685,9.97789937430074 +396,-3.568501887826641,-5.088851488135153 +397,-9.568684348087837,-5.253772154590987 +398,1.6852283040068348,9.30206036715821 +399,2.1342159865596315,8.910948360147488 +400,-6.629342939316992,-7.850706578446751 +401,2.0937767705468238,10.684273022018974 +402,-9.676217713369718,-7.313906906384972 +403,1.9570383690447184,10.003520022659853 +404,-8.939965060567271,-7.898097303618171 +405,1.4808132185460936,7.999137784006351 +406,2.3194719866422937,9.036137023298723 +407,-8.835902135605371,-7.052629529645275 +408,-8.238054814755314,-9.249598787232223 +409,-4.660901299028566,-7.706916900944542 +410,0.3783309638984087,8.290738393471463 +411,-5.015776514744971,-6.519066632817204 +412,-8.421053394915008,-8.736718539801242 +413,-4.435649194270888,-6.696459121403307 +414,2.247926822227134,9.57329133500239 +415,2.0222130431499963,6.975811120753269 +416,-10.306964876414497,-8.736415116583338 +417,-9.709449108929526,-5.153622128090238 +418,-4.017799127595082,-6.995636771418465 +419,-10.076959087536602,-6.21256389676847 +420,2.659238286258088,8.806459088474094 +421,-3.548608592940656,-6.236718753216755 +422,-4.62073236673274,-5.675859560710455 +423,1.5425505794868228,10.321222555804594 +424,-4.704008819633645,-6.048358572352397 +425,-4.379166498786982,-6.027328721967298 +426,-2.1851097524355154,-5.4972225996114386 +427,-1.5981643079912309,-7.268515428709705 +428,2.6828651920893583,7.562270572067991 +429,-5.0291400786305225,-6.489316980749896 +430,-8.90568480456124,-7.233692326868969 +431,-5.004939817646501,-6.66263935533808 +432,-3.710438396645244,-7.47145550649408 +433,-8.431518653354582,-6.547806268928545 +434,-4.961399314441151,-5.517757584761213 +435,-4.424875928292114,-4.935088672171116 +436,-8.465630223843888,-7.526658276960419 
+437,1.1232273055419215,8.848543927546967 +438,1.6596342752172086,9.141054895640758 +439,0.7119140054568496,8.724208673911308 +440,1.0450609455563007,10.255952934802293 +441,-4.467548041647252,-5.754908323737896 +442,0.7968199534070038,9.810017983913482 +443,-4.635062452279791,-6.319914205163661 +444,1.1557559471601326,9.393630939514255 +445,-7.2189693831757324,-8.996252323908598 +446,-4.04121798967872,-5.9680578737510555 +447,-9.280259016217148,-7.344296039493729 +448,0.12990583915647091,9.534366746563984 +449,-9.080348829541355,-6.177346713521466 +450,-4.122544048604146,-7.424318232165486 +451,-8.523822110731878,-7.273222957182741 +452,2.4144759026265117,10.149542222925607 +453,-4.264806434701379,-7.993804495829595 +454,-8.843006328078824,-7.25237245189648 +455,-7.49037786278734,-7.149099301981595 +456,-5.072992400771182,-7.044488347703606 +457,-8.984565831557692,-7.556584668377714 +458,2.557179024476171,7.829957672041701 +459,-8.734624814022123,-7.572386626294366 +460,1.2351440435139123,9.43909831047302 +461,-3.3594653020514693,-8.995232214949041 +462,-9.701004421599954,-8.024999268149845 +463,-3.911609241339809,-6.404433416805523 +464,-2.643065906781685,-7.153288162400494 +465,-4.362786615686318,-8.81772389317671 +466,1.5120081296131147,9.911223821403949 +467,-8.531551914301003,-10.090011710251812 +468,-6.023729176014786,-7.112820404050528 +469,-2.5985519868592517,-6.4999945557865235 +470,-8.83500699255377,-7.718829054623505 +471,-5.75919608049208,-8.762533328160302 +472,-4.156775252969363,-5.923221304091207 +473,-0.382966756977273,6.909940417424705 +474,-3.7143796750446025,-5.330378058802522 +475,-4.931516866558665,-6.9273383758445854 +476,-8.192359305960302,-7.966452982690816 +477,-3.725534195535071,-6.923575094320778 +478,-1.6567493690717532,8.068007354960876 +479,-2.146889726406772,-5.438731120358826 +480,1.6640559067378071,9.675773158258412 +481,0.18778248663288855,8.428880250673016 +482,1.2712054384816953,9.5936730208319 
+483,-9.777477551079443,-7.756277642160135 +484,1.2220274964294986,7.937710345285722 +485,-9.798771689602779,-7.310089107176487 +486,-2.5769948606619275,-5.236907832855168 +487,-3.280039956630294,-5.921645113967604 +488,-10.385438473456237,-8.978699614287862 +489,-3.538238933516765,-7.776971810237942 +490,-2.370442677763312,-7.492491016579457 +491,-9.22774914798459,-8.210795315347276 +492,-3.5970918987451044,-6.287436181495865 +493,-8.517183803198076,-6.662349844790031 +494,-8.287708904391959,-8.15649593679152 +495,-9.135473938940752,-7.855739524229387 +496,-8.669299782592173,-8.170797467143105 +497,-10.005994810578429,-7.936401567132531 +498,-8.544223415826707,-7.8982358071396135 +499,1.380211293463183,7.933133954036769 +500,-7.9655997110381795,-6.941204946690403 +501,-3.465161569489663,-6.077641920307408 +502,1.83638322311871,8.903049115882464 +503,4.120788211597215,9.110648038878697 +504,-2.794003193070597,-6.345799773867606 +505,-3.8239238176980557,-9.107238271082618 +506,0.7875933191763557,11.184572255529599 +507,-9.252786356666299,-6.843383729987514 +508,-7.998189201610247,-8.415058408832383 +509,-8.16094072563349,-7.583334007895792 +510,-11.05487333596034,-7.736249582486785 +511,1.8437259315731145,8.988020831895229 +512,-8.699105002659575,-5.307239816191379 +513,-2.466750766070521,-6.512870969238753 +514,3.410874103126626,7.867497335793834 +515,-9.752795331013969,-8.67435328723578 +516,-8.466944781470925,-6.791226698236835 +517,-10.616759196450891,-6.880881334143791 +518,0.43176792520881224,9.550687380593555 +519,3.736985587600425,8.853374927229641 +520,-4.532165464595547,-7.339471392762515 +521,-4.0812752157406305,-4.625339795130051 +522,-2.834654789895266,-5.783233762871275 +523,3.1267617202318103,8.886048721823393 +524,2.5997957103384843,9.835592180820424 +525,-2.8382644386960774,-6.663377478979936 +526,-4.883073540738554,-4.148753193485065 +527,-4.441561297964279,-4.69256538859622 +528,-0.3798796161990645,8.96320635596488 
+529,-3.7109403060725477,-8.147828020606191 +530,-10.452756650029135,-8.807450425963179 +531,-9.820558778263809,-7.31072935827564 +532,-0.42268912766815525,8.631901499535282 +533,-4.093024424548601,-6.675411350292116 +534,-4.254640033558407,-6.478024511999712 +535,3.0761166422810886,11.077524125411133 +536,2.25594594426518,9.071468737291259 +537,-10.114562377154575,-6.719087017466592 +538,3.205132879039055,7.497445881908755 +539,-10.267700148068077,-6.713722100232483 +540,-10.417438485978314,-8.465832867634974 +541,-2.5164847102881867,-5.242531699126268 +542,-2.698794657384064,-6.103109872551561 +543,3.8165342813345537,8.841505234091182 +544,-10.227608589045904,-8.195906924722983 +545,-3.663699867237535,-8.27674770897949 +546,-3.261706481959633,-6.159224936936016 +547,-4.334510759437645,-5.956654161389583 +548,0.6999551100635917,9.42591422972624 +549,-9.151125616059769,-7.0551031934317905 +550,-9.223902039701933,-6.860062637838487 +551,-7.4615598493100865,-8.528019264479758 +552,-2.4380734875462897,-7.507277903677772 +553,-4.026642611221013,-5.110849404862844 +554,-9.389621272355004,-9.397979778210841 +555,1.7150238377255527,9.620244522910358 +556,-11.309469090090785,-7.9345599863656355 +557,-1.508483049723969,-5.969192439095569 +558,1.5454178016331435,9.049535083232207 +559,0.6647835495149763,10.331377822257668 +560,-10.500536041382844,-8.146345516551253 +561,-3.9299577253687468,-6.9635497993940065 +562,0.8411312844844993,8.368249720855683 +563,2.9506985987058885,10.331232109962894 +564,1.0316182325791265,8.770754440676154 +565,-4.312763375043856,-7.412307169564694 +566,3.1563371086285135,10.395086485340384 +567,2.002138375134201,8.547140346608217 +568,1.6923235633247864,7.353260846660685 +569,2.473672440958805,9.136461866113027 +570,-10.40814549184069,-9.118682041635388 +571,-4.065550600052492,-6.905831922608004 +572,0.2922467951737049,9.546441508105172 +573,1.5891585983241838,8.18355792173308 +574,1.4248639028982657,9.728308665738702 
+575,-3.9669243990361025,-7.319350698393833 +576,-8.265092850484752,-7.42655563395339 +577,1.7895143670171882,7.819868806232385 +578,2.1336669469904095,9.948097835862201 +579,2.421396986118362,7.483100182629542 +580,-8.861580320724723,-5.474171763623955 +581,-8.90503421798681,-7.311777925305353 +582,-8.436633589948826,-8.523758539750995 +583,-5.049611580771868,-7.364445627614865 +584,0.7073023937231189,6.509679270754108 +585,1.2552894767144336,6.843131169029357 +586,-4.003962548081533,-9.027134085227624 +587,-4.49821963772513,-7.656193217995114 +588,-4.4339247440343454,-5.309484171372889 +589,1.2819691769435653,7.412341364219434 +590,-4.450143227312109,-6.59708814346974 +591,-4.499151353997774,-4.982990326147527 +592,-9.965975298915485,-7.638169628639451 +593,2.7968100908721762,9.0415309589172 +594,-3.4947283373962827,-5.86149606270687 +595,0.8206306411514246,9.044253333258009 +596,-5.021684732671406,-6.862529076101957 +597,1.5626206680018402,7.537251089399232 +598,-4.731861244222261,-8.35463204807969 +599,0.02102828690014613,10.047689525469467 diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_4.csv b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_4.csv new file mode 100755 index 0000000000..98ee92ecea --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/data/distributed_data/daal4py_Distributed_Kmeans_4.csv @@ -0,0 +1,601 @@ +,0,1 +0,0.10332010330206987,7.433057527844319 +1,-11.457080958126577,-6.770419344202086 +2,1.3549596047187271,7.700376028699749 +3,3.778150756893532,8.684061546713968 +4,-7.383885961653566,-8.618036742404994 +5,-0.39730229489662605,9.938300708523482 +6,0.5160441373354219,8.083982733069245 +7,-4.40642837789918,-4.6250083216575355 +8,-2.021319778039908,-7.629105587345337 +9,-8.69643931304748,-8.853871367088468 +10,-2.850870377910395,-7.749521305500871 +11,-3.8889488635523684,-7.407902098242759 
+12,1.7829979167708168,10.567535943918909 +13,-3.519846055844178,-6.176139856637121 +14,1.5542403836477212,9.658471608302795 +15,0.4492306834259767,8.865118438070576 +16,-8.931885192609018,-6.637687003560089 +17,0.9532438492185421,8.939440062166604 +18,-4.732335952802864,-6.611045721670156 +19,1.2625408752904765,9.241926407970649 +20,-9.07086649044261,-8.134895879104807 +21,-8.036931586085002,-9.175845150648595 +22,3.11033574236728,9.120552071901347 +23,2.1275057983269905,8.28756053994324 +24,-2.3684112074313166,-5.886028660737004 +25,-8.964985592313594,-9.366382464393025 +26,-2.803686561279893,-8.069423213817318 +27,0.9129238925859261,8.650125187645022 +28,-11.08759019781802,-8.723907914360367 +29,-9.27887309508911,-6.765499402144199 +30,-4.765684514060551,-6.551347862368417 +31,-9.930360118635935,-7.083002016140143 +32,-8.211276870786092,-8.929597262859707 +33,0.8393549155970421,10.413232778704002 +34,1.486733016242737,7.202634962739694 +35,3.0575774682065946,7.482802373308799 +36,-8.564323295750087,-8.716844652132215 +37,-9.55725601240105,-7.310207215679147 +38,-8.901238507527413,-7.637897364610855 +39,-8.974367376520202,-8.55745587738768 +40,-4.320376872754786,-8.466550038397596 +41,-0.49258782785101607,10.716693498249228 +42,-4.6352245569022035,-5.336299582614836 +43,-9.099679594367988,-8.485814462290483 +44,-9.714631488909273,-5.22939458647355 +45,1.6501597558146126,8.430373326754374 +46,2.1405083621048875,9.1996702543802 +47,-5.149397117223147,-5.125656239754709 +48,-8.754121162974524,-9.53297997717857 +49,-4.175216584819035,-4.824835319920182 +50,-11.032739833239267,-6.190047430065487 +51,-3.3437310422037836,-4.953461730138913 +52,-4.903394237228955,-5.637326752057649 +53,0.635594613159898,7.2205676497228515 +54,-2.285295507751895,-5.067705899188625 +55,1.6287755069412115,8.090862721219649 +56,-8.488947479715254,-8.203791664131273 +57,1.6046322122303396,8.581881077986456 +58,-2.7451963356907845,-5.528097876400736 +59,1.3000010418976555,8.25061178560787 
+60,-8.575068374111153,-7.743158262247462 +61,-3.0032342702015367,-6.879693561855583 +62,-2.3903311387601063,-6.617410057610613 +63,1.4914864078134888,8.385099919504881 +64,1.1475566710983691,9.255035510930956 +65,1.7011767872758812,9.468439392758494 +66,-3.723267948794898,-4.5334697958159484 +67,1.9338982595865435,9.075252553317402 +68,-9.454275646601058,-8.146236472154563 +69,-8.614498108394772,-7.930028618297809 +70,3.3058229783134343,8.985672226433286 +71,-7.453263421868005,-9.589499666243544 +72,-9.051131862237169,-9.040052762503546 +73,-8.332827063749622,-7.309261538789444 +74,-4.182313849871069,-5.646287098804299 +75,-9.548574515271493,-8.321835170547535 +76,2.7279018461305022,7.883227926925386 +77,-9.169419979477773,-8.179717957397253 +78,-2.9137491598286718,-4.946211982196632 +79,-4.576965064541776,-6.648301200185011 +80,-8.396218896667257,-6.182254078938222 +81,-4.064800602702525,-6.496690345077638 +82,-4.292125187261824,-3.8110544249712213 +83,1.080149538643875,8.208812212381472 +84,-8.14636036309612,-8.127046141511695 +85,-9.72981639348356,-6.485715224054457 +86,-9.9914366518976,-7.251547617157808 +87,2.430874145068557,9.622705735890477 +88,1.3088331161191342,8.273567356619495 +89,-9.552102393402203,-9.242234279567885 +90,-0.05324114522390522,8.161757992806596 +91,-8.939521887926572,-6.698579044615198 +92,1.2400373350855958,10.660273458611586 +93,1.0170328792607584,8.446784945275587 +94,2.8433776388033603,9.57957504440497 +95,1.9226053681179893,9.619604157171334 +96,-2.822978961795278,-6.577214478461163 +97,-0.744358881914156,9.602600862844417 +98,-3.2159513863608074,-6.205447638523938 +99,-3.3613726180816954,-6.015048326689677 +100,-1.028033098050419,-6.143185893865859 +101,-4.514068308320062,-8.77832980959645 +102,-10.621848014522701,-9.208719953912157 +103,-4.114696761001476,-5.8474981315029275 +104,1.4620500318015608,8.251448431036307 +105,-8.716980262412852,-7.401842999545523 +106,-10.480874347578267,-8.006256173493918 
+107,-8.278854397415845,-6.956830111630074 +108,-3.301118306039926,-7.476786197915821 +109,2.3171065628512504,8.212345073816197 +110,1.710303138927003,8.813298496766798 +111,1.7608999943759904,8.196426898269944 +112,-9.049000425789696,-7.445395642780055 +113,3.3578958689069687,8.391684342227013 +114,-5.257799254544944,-6.640631405280244 +115,-10.243977422361972,-7.510371347855795 +116,-4.550134137239694,-5.856090796346537 +117,-3.7054788079266467,-6.680665785379875 +118,-5.099719716889362,-8.281616138774524 +119,-3.3630255444771007,-7.8152323872476686 +120,-5.053107949784717,-5.714018343138573 +121,-4.996014479508486,-6.867452649832137 +122,-9.79087311549495,-7.990696121215509 +123,-9.894952432863954,-5.919664972952979 +124,-7.998077960678973,-7.466145043480028 +125,-7.626417104394374,-7.8970534800882035 +126,-4.1881383908819405,-6.939057961967295 +127,2.34960172025711,7.739116063181076 +128,-3.1417708078401354,-7.629837732353748 +129,0.9289305865777172,9.988033988591333 +130,-2.164421767072346,-7.50649207969462 +131,0.41853000838584387,10.110931422144121 +132,0.7400807311324775,8.508373501577271 +133,-1.1364834746249484,-7.709871462628085 +134,-3.718475171665707,-7.322498643982633 +135,-8.56665079533079,-6.186519160575655 +136,2.747364757047606,8.91244845395835 +137,-3.7208995875130535,-5.025008292927113 +138,0.7392223115055112,9.315925989714465 +139,-1.3087755832537313,9.014376965679181 +140,-8.582365743135174,-7.856649895105258 +141,0.7993005612885375,7.917111342530812 +142,1.6563928814647313,9.617357563577665 +143,2.085824386959823,9.599755735558062 +144,1.5013443722140052,10.722417673062296 +145,2.1754135426933945,8.720714517175313 +146,2.3588382752863053,8.614914851805098 +147,1.211438801211038,8.42677120938315 +148,-1.5745761586324107,-7.183785235449116 +149,-2.964766675561709,-7.489589496345692 +150,-8.910820704195393,-6.919512593396113 +151,-8.9816280693271,-7.287428022339447 +152,1.6571173127857766,9.234840296892475 
+153,-8.924784980289425,-8.025684926024711 +154,2.2686724726124075,8.308204858154415 +155,-8.200048485170736,-5.932219763863781 +156,1.8919238459888035,8.969986201271237 +157,-2.441642321765225,-7.100833966853025 +158,-7.033715194515465,-6.970109573816821 +159,-9.773935882768308,-5.724722764925062 +160,-8.601410625703446,-6.949078220901538 +161,2.5210681149799514,10.061607818392295 +162,-9.127335669111101,-8.3008763692738 +163,-9.709771657142020,-7.444212662130526 +164,-2.5700986302088538,-6.987966188150011 +165,-0.7702213291329225,7.574082930390704 +166,-4.4719799493922086,-5.409031535461088 +167,-3.750631879920109,-6.200685420473867 +168,-2.5302437103150535,-7.25712894315796 +169,0.8481158875951356,8.36938393164685 +170,0.18493531626255533,8.26694017721234 +171,-2.977230529674405,-6.680446391670111 +172,-5.882850122238738,-5.492789105806717 +173,2.296876263463775,10.533791321000571 +174,-4.157111089052275,-6.271131973108935 +175,-7.648711768400524,-7.5250019570491835 +176,-3.052565904860986,-7.950607586383728 +177,-0.11348035149120594,9.720185255137826 +178,-2.8593918126369857,-6.419098005011792 +179,-3.1105485621828706,-6.552547464236643 +180,1.6544164300532487,8.585517480127624 +181,-4.4613354967678065,-5.356868999473551 +182,-4.269920276314392,-5.279118977120863 +183,-1.9557780319538798,-5.854154911905707 +184,-9.498374262961063,-8.736327789643028 +185,-3.577803410517134,-6.3192637115085875 +186,-3.597871160549303,-6.181116722975659 +187,-4.177092821608165,-6.056579511534875 +188,-8.111083896430419,-8.277691411511093 +189,2.152045243993091,11.373438292614141 +190,1.0155641264093118,8.156438295352316 +191,-9.924268070310191,-6.630830852887261 +192,3.18360240206096,9.56007164758904 +193,-7.83078901744347,-7.692154085752587 +194,-3.6288843509443485,-7.329206450334352 +195,-4.469054874961147,-5.30409106500506 +196,-9.348956010813758,-7.344898106201228 +197,1.3866028257974798,9.45263507444789 +198,-3.2655886263343787,-7.534884911872669 
+199,-4.099344992744675,-7.242477596850821 +200,2.109585403235858,8.082138286177194 +201,-2.6189700083390726,-4.293503587445704 +202,-0.5242851270504048,10.905414559024496 +203,-4.004655195359291,-5.475853674307111 +204,-9.432373582735602,-9.362717304474161 +205,1.4790975677294698,8.71509103766059 +206,3.1125907560604875,8.195264328734265 +207,-1.4908037421015017,-3.467285706166175 +208,-4.998111987077401,-8.12384626655243 +209,-2.959755619948657,-5.261842477246686 +210,-3.4688681427114894,-6.781982463262677 +211,-3.0564543070606556,-5.284221521163483 +212,-3.534702754309033,-6.805496549497049 +213,-4.74454012304712,-6.741814055899147 +214,-7.700936300263158,-8.60269464330942 +215,1.1791405914420183,8.612798281538595 +216,-5.245417900815677,-7.799109362884348 +217,-4.709874141302528,-6.272734950183002 +218,-7.477604407582303,-6.86011311708189 +219,1.4217403567022842,7.568046564888566 +220,-4.192264218725802,-6.954960331045015 +221,1.9807052384963193,8.660580460372561 +222,-3.627977418159731,-5.704035537448408 +223,-3.6160280365046327,-6.791686350240313 +224,-8.783511942118828,-7.686120915673679 +225,-3.1308372171133203,-5.742420381991395 +226,-5.52144948144214,-9.003088074639763 +227,-5.486164657388887,-6.98593906653643 +228,-4.156474586100949,-8.87506003747911 +229,-8.667564519104046,-6.744364238166 +230,2.1607196671238063,8.706563384883532 +231,1.3910987282344514,8.438017350017722 +232,-1.7846284549350655,-6.262239014702814 +233,-2.854713779303287,-7.0976957664158205 +234,-4.273746686743,-6.853167487164358 +235,-10.61782516124727,-8.558369357907177 +236,1.2187745068862288,8.110594277503289 +237,-3.217943234654223,-7.6804732209951165 +238,-3.9982576778797805,-7.505720398808794 +239,-9.92166044372743,-6.380166184966641 +240,-2.1104312634549798,-5.98678030303904 +241,-9.11078914175272,-8.502898695567145 +242,-4.404724148057582,-7.027673783311267 +243,1.4046346166894423,8.160547966311894 +244,1.8085737289713733,8.74066220723949 
+245,0.28873537383188985,8.08140213929215 +246,-10.912806528631155,-7.739077680114303 +247,-10.467507832971958,-7.446586715066022 +248,2.859993801772723,8.238512486164346 +249,-3.3925299450589783,-7.214307850771746 +250,-4.741101632204018,-9.302406019691876 +251,0.6431995533037844,10.677337968545057 +252,-3.6834309946362778,-6.163620275639666 +253,-7.160588123680955,-7.999296011074655 +254,-8.406409357621815,-8.549109163011591 +255,-9.796003209558192,-7.779235768767059 +256,-7.785379338163239,-7.670928916501254 +257,2.402902221889004,9.178654547818025 +258,-3.9317733571520757,-6.888115412065522 +259,2.671849682572569,9.671854709207393 +260,-3.6831851093579417,-6.54062430384994 +261,-3.5530171441598974,-6.057034504236865 +262,-9.587474977945119,-6.5942830768698 +263,-8.331110781524744,-7.979672709454525 +264,-9.983737552654869,-7.020393931935233 +265,-4.877119889593441,-6.833803262685101 +266,-10.273359386544174,-9.152157876102272 +267,-10.462614802859681,-5.70811623533384 +268,2.3905941594302975,9.668424236419554 +269,-3.145674163416154,-6.925361659708869 +270,-8.772849374639128,-6.185526439282244 +271,-3.5401553464857303,-6.412623215630819 +272,-2.8308976895683333,-5.476156303082146 +273,-3.801975441372357,-8.17266581321063 +274,1.593478138989502,8.2283409057532 +275,1.1083736830525108,10.532946457515768 +276,-8.629021896198118,-7.288223300098911 +277,-4.244459466636461,-6.0044203274629355 +278,-8.358098042025627,-7.464441843191975 +279,0.5464259235576603,9.02654928587277 +280,3.2865148124029586,9.97054220915403 +281,1.6018509291466354,8.673197748734717 +282,1.4388645100714368,9.089227443346653 +283,-9.454027524648394,-6.637313223083979 +284,3.0590401094108373,7.858006030766688 +285,-7.526054504923103,-5.614511901600036 +286,1.2346751153152835,9.906582395613189 +287,-0.13016010738166028,9.36276717952036 +288,1.6353480069600483,8.883535422674692 +289,0.8501890864463089,8.704078528515916 +290,-3.6317772689427503,-6.04076375260842 
+291,-8.553679599109792,-6.568814403766624 +292,-9.477619958416945,-9.055024624943108 +293,2.035675808379171,10.653350117273046 +294,-9.337402974469537,-5.728922196268389 +295,-11.31124421531365,-7.05723670724196 +296,-9.80698762368018,-8.375912447746234 +297,3.7373452335031443,9.321632220693063 +298,-4.011483512512924,-4.674986311903249 +299,-8.209537548523056,-8.356640196985037 +300,1.4749171979243114,10.028770530826403 +301,-7.972809098669269,-8.12381123289332 +302,2.2334887410176107,8.895114750863513 +303,-7.739791719131931,-6.214711879386799 +304,-8.70596283786291,-8.938921350708586 +305,0.8703213918463835,9.763785265754917 +306,-8.919286182232902,-8.584764544460372 +307,-8.11023140480786,-7.273682228244324 +308,-9.401987062380314,-9.103042776307934 +309,-9.689268054510414,-7.539979581889578 +310,-8.126157836206835,-6.778596427706855 +311,-4.0088121043655764,-5.356961822843008 +312,-4.236986989126743,-5.101553477004014 +313,-3.9533622236031305,-6.584825956670251 +314,-10.712363186343154,-9.907687244750802 +315,-4.045834408112561,-7.579783105822965 +316,1.5182309232334954,8.2963837721969 +317,-8.550430783632796,-6.5497827981160075 +318,1.6743982864281177,8.67229649069847 +319,1.4673441728410779,8.500504684629885 +320,-5.496968720655763,-6.3815348615910406 +321,-3.4320350436309432,-6.824549120125649 +322,-2.246472391300614,-6.128247237773868 +323,3.425436359090328,8.763192875496575 +324,-3.9917769596693673,-5.975766005177232 +325,-6.1147581304556935,-6.190005472690292 +326,1.3619645984107212,10.892414824558076 +327,-11.040559642110845,-8.212690211351045 +328,-2.174036274319838,-5.558392044639329 +329,-1.6633785293624905,-6.265876809324602 +330,-8.30217337077351,-9.488909011978945 +331,-7.845110222680548,-6.891976649453214 +332,-9.367428022519887,-6.432481825948209 +333,1.2776367034010043,8.419682111069969 +334,0.9213062898601015,8.777678054509824 +335,-8.527920863931795,-5.531575951306733 +336,-4.2241669725178435,-6.792482455826352 
+337,1.7667613765414085,9.499948597371562 +338,2.175659723569063,9.344174814918937 +339,-3.5592061683626133,-6.675762326267785 +340,-3.5310059944137207,-6.483067194066182 +341,1.0749452601168323,8.650513425496522 +342,-5.143341686933297,-6.700188180597839 +343,1.1540938158736873,7.118490003220929 +344,-3.7099080347150095,-8.390944999854256 +345,-9.398266042099355,-7.595020573828141 +346,-3.32663089696943,-6.822234294782555 +347,0.06574279875473032,8.748278513754103 +348,2.8352533700966123,9.45075925759949 +349,3.5651458584407374,6.451249532135987 +350,-8.026546721361738,-6.381019100481007 +351,0.7519981107452905,8.300478310236869 +352,-0.7483423612814932,8.004091220777179 +353,1.347951038529854,9.43929152971695 +354,-3.4193954359078695,-6.467844026656739 +355,1.1573664822385257,8.80959176919287 +356,0.016123419703701014,10.230923549284014 +357,-8.443009865153714,-7.959899179528147 +358,-3.9249510992305585,-5.121258294863617 +359,-7.9397064008801275,-8.363044452201077 +360,-8.642116526299922,-5.006428787826313 +361,-4.145879952677833,-8.134904918495604 +362,-3.2457991193774727,-6.107013567899146 +363,-9.029744286835285,-6.036258976308376 +364,0.0065644783216172176,8.97018307243166 +365,-9.673086344121625,-6.591968603858424 +366,-3.759004993831228,-6.30231583740271 +367,-10.32317524410533,-7.5513698353789085 +368,2.4775183767038893,9.54338392728662 +369,-4.316155291672273,-5.196642968528018 +370,2.028997494940821,9.170247459695451 +371,-8.070898868532892,-7.460648283641237 +372,-10.061493844560447,-8.172322328081924 +373,2.304021158783734,8.351238594157417 +374,-4.900125772010646,-5.702253875931586 +375,-9.49837898421794,-9.207519030981446 +376,-3.5324222528590714,-7.587224174570254 +377,1.194306183664361,9.212586422860277 +378,-3.5369957622034076,-7.1052986574312955 +379,2.884330925702126,7.512347577176563 +380,-5.234794047090055,-9.855029455960205 +381,2.686699888092497,8.406337969272872 +382,1.4052712698315613,9.544165777014138 
+383,3.0401246351813898,8.980268002406419 +384,-9.40180266225954,-7.469832906343165 +385,0.4529517325667254,8.989794896740293 +386,2.8024351458748833,9.804051065392992 +387,-9.27559686944533,-6.929625232934977 +388,2.737937181321032,8.547815544851781 +389,-3.3677928967889055,-5.891101470343494 +390,-9.856978945720277,-7.346416050852512 +391,-3.13748303886494,-5.747958319194765 +392,-8.026813755262788,-8.718162563492122 +393,-9.485468757044574,-8.768424261724446 +394,1.6249405527760306,9.540571251809629 +395,-2.6063334783739194,-6.956578993790879 +396,-4.84985808744287,-6.770874009149754 +397,-4.525907372363473,-6.1069568389530104 +398,-4.723989289922966,-5.660033704993747 +399,-10.13349528069687,-7.633357736272925 +400,-3.3869513833525042,-6.080778240391612 +401,-3.758108955113767,-8.234590752370567 +402,-4.269975499999625,-5.67628904605929 +403,3.6302030738962983,9.261585481262125 +404,-4.907342450755322,-6.442265391426007 +405,-9.716354868345709,-6.832040088737648 +406,-9.62155214151273,-8.144201874844567 +407,-3.7954904093471575,-4.387877920118073 +408,1.0840211539446736,9.91191638500914 +409,1.8864266102008502,9.427349758224441 +410,-5.6090921969231475,-6.808196491662715 +411,-8.765111618596253,-7.341162120979236 +412,-8.423016337331168,-8.175796863871167 +413,-3.8033475331495126,-7.135895145037555 +414,1.194645243023539,9.237249135567533 +415,2.1350904063644798,8.808176512652985 +416,-8.502877013804621,-5.838240491795165 +417,-3.6439747332939834,-7.0526876957594045 +418,-9.157601295562374,-8.62908788389255 +419,-10.32462697151651,-8.240160867274547 +420,-3.1976209294393048,-6.766212622136209 +421,0.22636579468006368,9.782266383411494 +422,-6.071101186595537,-6.334559114232327 +423,-4.13424363559873,-7.2230560364136815 +424,2.8866308846997635,9.48403941471309 +425,1.631738769356994,9.049283963637885 +426,-7.711497872027216,-6.918915081058153 +427,1.5945344992892474,8.355559947118106 +428,1.6105302308281981,8.92465974100905 
+429,-8.900101678745584,-8.36971807214309 +430,-1.6751817498200121,-6.3104146250134825 +431,-2.894584684262049,-8.036841567603094 +432,1.796657968898645,7.948850883900677 +433,1.0478983997059044,9.363103162399927 +434,-2.8409450135907552,-6.171628297545874 +435,-2.227306347842321,-6.307491116034234 +436,1.6260369917293342,9.016427568754885 +437,-6.813430426503064,-7.644333527982396 +438,-4.462839299184512,-6.573060219011799 +439,0.1690909005821588,9.16580847024903 +440,-7.598677898814356,-7.0712033610174085 +441,-7.130545845445336,-8.707415522912568 +442,-8.934843243074333,-7.952078437140888 +443,-3.978099590488443,-6.628705569581859 +444,-2.0712704040859973,-5.601505609040887 +445,2.0107184297942395,8.780675845244515 +446,-3.612174159813057,-6.188615653153706 +447,2.210365580671883,8.016784178228079 +448,-4.714867081035019,-9.349071721468626 +449,-7.603898205496083,-8.606528934089987 +450,-4.060103843491486,-6.57141738920403 +451,3.0452962731722475,8.884983588793832 +452,1.392452429215557,9.793422837370947 +453,-9.709064683019628,-9.766253362570373 +454,-3.519895048653918,-4.50156998426432 +455,2.9581576907643146,9.90516142670879 +456,2.1006943315411624,10.852047777968103 +457,-3.740157806215068,-6.14909650979808 +458,-8.069207083924681,-6.857970269064291 +459,-1.599187821745982,-6.725930792173125 +460,-2.767838394170003,-7.581390024530235 +461,-4.883757201427826,-7.189390521384471 +462,1.6596127274734807,9.630361598946024 +463,-3.159475642662568,-6.692154385691226 +464,1.1998382168844066,9.75563664756965 +465,-9.38331058088836,-8.299734614250717 +466,-7.690211872752039,-6.338609710764013 +467,-7.595740118618205,-5.1514551068927075 +468,-8.453363823055831,-8.00455706914973 +469,3.4034179522453205,9.547176659378051 +470,-8.883897172481838,-7.043012176402381 +471,-9.24375628029471,-8.133943012104707 +472,3.030682510948817,6.618218462786886 +473,1.6003638054848441,8.658084851830768 +474,-8.155015798946494,-8.624187016721818 +475,-3.13659771045095,-7.28836362434932 
+476,-10.242449146559304,-6.961456415847736 +477,1.6054337578913696,8.810540316186987 +478,-9.635539446409513,-6.765951710036209 +479,-8.577805905732227,-8.667280072120896 +480,2.886215426011436,8.982921132883831 +481,0.7428492084039351,7.738771003865294 +482,-9.196553247815073,-7.92827867725508 +483,-3.693772722581926,-7.005408387193325 +484,-5.852953942291274,-7.250235565635251 +485,-4.0273949389934405,-7.509024768532071 +486,-3.5495371738813892,-5.651485732968955 +487,3.1324197020855764,9.624251667101094 +488,-5.361297037860611,-4.843864066191982 +489,2.387823375106472,7.855001277929019 +490,4.294189895452808,8.247457224926999 +491,3.4935052421884745,7.742016632282266 +492,3.295763345159723,10.196403866251892 +493,0.25522282062584756,8.797612288356916 +494,-9.174138183617176,-6.689011630188279 +495,2.363426941545468,7.950471612301264 +496,-3.288847817689608,-7.442175509767312 +497,-6.14653267940901,-7.829939896149226 +498,-8.740032954828624,-9.306254941130103 +499,-3.027879646045245,-7.217619315047918 +500,-9.090040975217521,-7.731935852212176 +501,-2.7474414193326173,-6.609617700465485 +502,-8.466691586974017,-8.723660649567059 +503,0.8745756609720133,10.299831979302303 +504,-5.569382978924867,-5.871387110342048 +505,-5.060035529778957,-5.8637954937466334 +506,-4.194768378212306,-6.191284911206974 +507,1.9108114255589896,9.341347244142318 +508,2.757968314057243,8.068380353373065 +509,-9.457255140119786,-6.82228772868949 +510,-10.103743972207798,-6.163900492820808 +511,1.3654520919423347,7.430361875030972 +512,-10.539029327653,-7.048406703897319 +513,-9.610832138636368,-7.866206040717246 +514,-8.86913950421189,-7.165287966918405 +515,4.454529071472935,10.930805747352025 +516,-7.228541203936608,-7.801934124727664 +517,-7.960814643910588,-8.799585007463984 +518,-7.150102452454222,-5.934653293322399 +519,2.225873749663514,9.426840907394965 +520,-9.959393056256307,-6.695374583702613 +521,-10.291965829260349,-5.930930881957479 
+522,2.2946809740902503,8.557590624852777 +523,-4.6837635934718325,-6.365077738131338 +524,-3.2203843310929776,-7.492467284541282 +525,-2.6251346193894634,-7.96861460076258 +526,-8.740940249070299,-8.725403642555193 +527,-3.2090921150933527,-4.150885397038803 +528,-8.095934459427891,-6.163405383251439 +529,-4.06640058664513,-7.59179736258314 +530,-8.986712346492256,-8.302193302573343 +531,-4.057358455778956,-6.376310776362724 +532,0.8597153790180284,9.971206639797476 +533,-10.238756492798004,-7.032909005136046 +534,-3.818802764142079,-7.117417145772932 +535,-9.387796060901183,-7.753538029825167 +536,-4.754370119268106,-7.41570935642935 +537,1.1377862575874307,7.858982216165167 +538,1.2033960393411534,8.370842929142466 +539,-6.682543227685962,-6.4354127077290535 +540,-4.376838486705878,-6.757929731118529 +541,1.4763144442194964,8.767287524041029 +542,-9.15903461658571,-7.743373236050885 +543,-4.635303453440217,-6.529589710059669 +544,3.807486502902972,7.601658904651514 +545,2.8230904569409025,8.751443168670802 +546,0.9299649702335456,8.830943517071482 +547,-8.691582658466626,-8.503413125974713 +548,-9.071621254951342,-8.710352942506473 +549,-4.049766712813901,-8.664499361565676 +550,1.8245463275173732,7.9761674099761635 +551,-3.3493823486024947,-7.493249748006302 +552,-6.989707010138683,-10.403483194439584 +553,-9.251915833519178,-7.366540058976776 +554,1.5493969049409129,6.616970097682397 +555,-8.387243825087705,-6.3584597244096575 +556,-5.419049608305913,-5.939452585406271 +557,-4.370029593497041,-5.91071909098837 +558,0.9312862912974271,8.829513721120074 +559,-3.4126342996730945,-6.793426480956431 +560,-4.297065951581121,-7.472319629223204 +561,1.1070493362854699,10.265920257932057 +562,-8.354902370372622,-8.759611704266433 +563,-1.1760744018566798,-7.368820263201566 +564,1.1412600947448923,7.2167488709494485 +565,-1.8406854826480157,-5.044577645545261 +566,1.0158097003284685,8.30291649430223 +567,-3.6295727926805164,-5.502430962352257 
+568,-7.8031522894756815,-9.506751304080428 +569,1.0041110581320907,8.750171253145108 +570,-9.319926024044026,-7.733415511117693 +571,-5.486645481531933,-8.83303062916075 +572,-4.603566875937983,-7.7984975677448585 +573,-3.972435031417204,-5.622934143284206 +574,-4.6359557675552665,-5.907135244908342 +575,-9.982127344096675,-7.327077425734391 +576,-2.1826412023412667,-5.718481255870376 +577,1.6076158887039014,10.035529414003625 +578,-9.657866194455266,-6.346791617600089 +579,-7.586898689929213,-8.124987571516584 +580,1.593291201280905,8.715554965102461 +581,-2.916782935354813,-7.121178500724829 +582,2.6068939877583848,9.700345198182983 +583,-10.83923115580838,-8.755033803749605 +584,-10.897319142176741,-6.967850097530078 +585,-2.35304395343788,-7.675821778854565 +586,-4.284376713730921,-5.014144131177996 +587,-9.540278288389061,-8.878692618031153 +588,-3.718801600771222,-6.645133456613008 +589,-3.906793398728766,-7.412486938254312 +590,1.325668478051277,8.589775939517128 +591,1.760049645315195,9.871884524484415 +592,-3.376000557366496,-6.414735971787719 +593,-3.3968410194594814,-7.496309738719732 +594,-0.8635316869591434,8.470488027768871 +595,3.212842131889252,7.617877682770626 +596,-4.112611607209468,-6.708665111654143 +597,-5.824710556376033,-6.001569780543621 +598,-4.596295359433397,-5.458264749394884 +599,-4.345974396894074,-7.470598519372723 diff --git a/DirectProgramming/FPGA/.gitkeep b/Libraries/oneDAL/daal4py_Distributed_Kmeans/models/store_models_in_this_folder.txt old mode 100644 new mode 100755 similarity index 100% rename from DirectProgramming/FPGA/.gitkeep rename to Libraries/oneDAL/daal4py_Distributed_Kmeans/models/store_models_in_this_folder.txt diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/results/store_results_in_this_folder.txt b/Libraries/oneDAL/daal4py_Distributed_Kmeans/results/store_results_in_this_folder.txt new file mode 100755 index 0000000000..e69de29bb2 diff --git a/Libraries/oneDAL/daal4py_Distributed_Kmeans/sample.json 
b/Libraries/oneDAL/daal4py_Distributed_Kmeans/sample.json new file mode 100755 index 0000000000..ca263dac01 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_Kmeans/sample.json @@ -0,0 +1,22 @@ +{ + "guid": "B69FAC86-88BF-41BD-B4E0-ACDF753ED3CE", + "name": "daal4py Distributed K-means", + "categories": ["Toolkit/Intel® AI Analytics Toolkit/oneDAL"], + "description": "This sample code shows how to train and predict with a distributed k-means model with the Intel Distribution of Python using the python API package daal4py for oneDAL", + "builder": ["cli"], + "languages": [{"python":{}}], + "dependencies": ["oneDAL"], + "os":["linux"], + "targetDevice": ["CPU"], + "ciTests": { + "linux": [ + { + "env": ["source /opt/intel/oneapi/setvars.sh --force", "source activate base"], + "id": "d4p_KM_Dist", + "steps": [ + "mpirun -n 4 python ./daal4py_Distributed_Kmeans.py" + ] + } + ] +} +} diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/License.txt b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/License.txt new file mode 100755 index 0000000000..a3ab05efce --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/License.txt @@ -0,0 +1,8 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +© 2020 GitHub, Inc. \ No newline at end of file diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/README.md b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/README.md new file mode 100755 index 0000000000..19a0633325 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/README.md @@ -0,0 +1,119 @@ +# daal4py Distributed Linear Regression +This sample code shows how to train and predict with a distributed linear regression model using the python API package daal4py for oneAPI Data Analytics Library. It assumes you have a working version of MPI library installed and it demonstrates how to use software products that can be found in the [Intel oneAPI Data Analytics Library](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onedal.html) or [Intel AI Analytics Toolkit powered by oneAPI](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html). + +| Optimized for | Description +| :--- | :--- +| OS | 64-bit Linux: Ubuntu 18.04 or higher, 64-bit Windows 10, macOS 10.14 or higher +| Hardware | Intel Atom® Processors; Intel® Core™ Processor Family; Intel® Xeon® Processor Family; Intel® Xeon® Scalable Performance Processor Family +| Software | oneDAL Software Library, Python version 2.7 or >= 3.6, conda-build version >= 3, C++ compiler with C++11 support, Pickle, Pandas, NumPy +| What you will learn | distributed oneDAL Linear Regression programming model for Intel CPU +| Time to complete | 5 minutes + +## Purpose + +daal4py is a simplified API to Intel® DAAL that allows for fast usage of the framework suited for Data Scientists or Machine Learning users. 
Built to help provide an abstraction to Intel® DAAL for either direct usage or integration into one's own framework. + +In this sample you will run a distributed Linear Regression model with oneDAL daal4py library memory objects. You will also learn how to train a model and save the information to a file. + +## Key Implementation Details +This distributed linear regression sample code is implemented for CPU using the Python language. The example assumes you have daal4py and scikit-learn installed inside a conda environment, similar to what is delivered with the installation of the Intel(R) Distribution for Python as part of the [oneAPI AI Analytics Toolkit](https://software.intel.com/en-us/oneapi/ai-kit). + + +## Additional Requirements +You will need a working MPI library. We recommend using Intel(R) MPI, which is included in the [oneAPI HPC Toolkit](https://software.intel.com/en-us/oneapi/hpc-kit). + +## License +This code sample is licensed under the MIT license + +## Building daal4py for CPU + +oneAPI Data Analytics Library is ready for use once you finish the Intel AI Analytics Toolkit installation, and have run the post installation script. + +You can refer to the oneAPI [main page](https://software.intel.com/en-us/oneapi) for toolkit installation, and the Toolkit [Getting Started Guide for Linux](https://software.intel.com/en-us/get-started-with-intel-oneapi-linux-get-started-with-the-intel-ai-analytics-toolkit) for post-installation steps and scripts. + + +### Activate conda environment With Root Access + +Please follow the Getting Started Guide steps (above) to set up your oneAPI environment with the setvars.sh script. Then navigate in a Linux shell to your oneAPI installation path, typically `~/intel/inteloneapi`. The Intel Python environment will be active by default.
However, if you activated another environment, you can return with the following command: + +#### On a Linux* System +``` +source activate base +``` + +### Activate conda environment Without Root Access (Optional) + +By default, the Intel AI Analytics toolkit is installed in the inteloneapi folder, which requires root privileges to manage it. If you would like to bypass using root access to manage your conda environment, then you can clone your desired conda environment using the following command: + +#### On a Linux* System +``` +conda create --name user_base --clone base +``` + +Then activate your conda environment with the following command: + +``` +source activate user_base +``` + +### Install Jupyter Notebook +``` +conda install jupyter nb_conda_kernels +``` + +#### View in Jupyter Notebook + +_Note: This distributed execution cannot be launched from the jupyter notebook version, but you can still view inside the notebook to follow the included write-up and description._ + +Launch Jupyter Notebook in the directory housing the code example + +``` +jupyter notebook +``` + +## Running the Sample + +### Running the Sample as a Python File + +When using daal4py for distributed memory systems, the command needed to execute the program should be executed in a bash shell. To execute this example, run the following command, where the number **4** is chosen as an example and means that it will run on **4 processes**: + +Run the Program + +`mpirun -n 4 python ./daal4py_Distributed_LinearRegression.py` + +The output of the script will be saved in the included models and results directories. + +_Note: This code samples focuses on how to use daal4py to do distributed ML computations on chunks of data. The `mpirun` command above will only run on single local node. In order to launch on a cluster, you will need to create a host file on the master node among other steps. 
The **TensorFlow_Multinode_Training_with_Horovod** code sample explains this process well._ + +##### Expected Printed Output (with similar numbers, printed 4 times): +``` + + +Here's our model: + + + NumberOfBetas: 15 + +NumberOfResponses: 1 + +InterceptFlag: False + +Beta: array( + [[ 0.00000000e+00 -3.20923431e-03 -1.06404233e-01 5.46052700e-02 + 2.86834741e-03 2.75997053e+00 -2.54371297e+00 5.52421949e+00 + 6.67604639e-04 -9.01293646e-01 1.96091421e-01 -7.50083536e-03 + -3.11567377e-01 1.58333298e-02 -4.62941338e-01]], + dtype=float64, shape=(1, 15)) + +NumberOfFeatures: 14 + +Here is one of our loaded model's features: + + [[ 0.00000000e+00 -3.20923431e-03 -1.06404233e-01 5.46052700e-02 + 2.86834741e-03 2.75997053e+00 -2.54371297e+00 5.52421949e+00 + 6.67604639e-04 -9.01293646e-01 1.96091421e-01 -7.50083536e-03 + -3.11567377e-01 1.58333298e-02 -4.62941338e-01]] +[CODE_SAMPLE_COMPLETED_SUCCESFULLY] + +``` + diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/daal4py_Distributed_LinearRegression.ipynb b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/daal4py_Distributed_LinearRegression.ipynb new file mode 100755 index 0000000000..ea51a22f11 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/daal4py_Distributed_LinearRegression.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================\n", + "# Copyright © 2020 Intel Corporation\n", + "# \n", + "# SPDX-License-Identifier: MIT\n", + "# =============================================================" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Daal4py Linear Regression Example for Distributed Memory Systems [SPMD mode]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IMPORTANT NOTICE\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When 
using daal4py for distributed memory systems, the command needed to execute the program should be **executed \n", + "in a bash shell**. In order to run this example, please download it as a .py file then run the following command (**the number 4 means that it will run on 4 processes**):" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mpirun -n 4 python ./daal4py_Distributed_LinearRegression.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing and Organizing Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we will be predicting **prices of houses in Boston** based on the features of each house.\n", + "\n", + "Let's start by **importing** all necessary data and packages." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "##### daal4py linear regression example for distributed memory systems [SPMD mode] #####\n", + "import daal4py as d4p\n", + "from sklearn.datasets import load_boston\n", + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **load** in the dataset and **organize** it as necessary to work with our model. For distributed, every file has a unique ID.\n", + "\n", + "We will also **initialize the distribution engine**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "d4p.daalinit() #initializes the distribution engine\n", + "\n", + "# organizing variables used in the model for prediction\n", + "# each process gets its own data\n", + "infile = \"./data/distributed_data/linear_regression_train_\" + str(d4p.my_procid()+1) + \".csv\"\n", + "\n", + "# read data\n", + "indep_data = pd.read_csv(infile).drop([\"target\"], axis=1) # house characteristics\n", + "dep_data = pd.read_csv(infile)[\"target\"] # house price" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training and Saving the Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time to **train our model** and look at the model's features! " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# training the model for prediction\n", + "train_result = d4p.linear_regression_training(distributed=True).compute(indep_data, dep_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To **get training model information** and **save it to a file**:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here's our model:\n", + "\n", + "\n", + " NumberOfBetas: 15\n", + "\n", + "NumberOfResponses: 1\n", + "\n", + "InterceptFlag: False\n", + "\n", + "Beta: array(\n", + " [[ 0.00000000e+00 -1.68027665e-04 -7.40435666e-02 3.72706786e-02\n", + " -1.32246207e-01 5.24821226e+00 -2.09646770e+00 6.15919748e+00\n", + " -1.17193612e-03 -8.86515999e-01 2.23344092e-02 -1.09556173e-03\n", + " -4.40967972e-01 1.12216533e-02 -4.74953243e-01]],\n", + " dtype=float64, shape=(1, 15))\n", + "\n", + "NumberOfFeatures: 14 \n", + "\n" + ] + } + ], + "source": [ + "# retrieving and printing training model\n", + "model = 
train_result.model\n", + "print(\"Here's our model:\\n\\n\\n\",model , \"\\n\")\n", + "\n", + "model_filename = './models/daal4py_Distributed_LinearRegression_' + str(d4p.my_procid()+1) + '.sav'\n", + "\n", + "# saving model to a file\n", + "pickle.dump(model, open(model_filename, \"wb\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **load up the model** and look at one of the model's features." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is one of our loaded model's features: \n", + "\n", + " [[ 0.00000000e+00 -1.68027665e-04 -7.40435666e-02 3.72706786e-02\n", + " -1.32246207e-01 5.24821226e+00 -2.09646770e+00 6.15919748e+00\n", + " -1.17193612e-03 -8.86515999e-01 2.23344092e-02 -1.09556173e-03\n", + " -4.40967972e-01 1.12216533e-02 -4.74953243e-01]]\n" + ] + } + ], + "source": [ + "# loading the training model from a file\n", + "loaded_model = pickle.load(open(model_filename, \"rb\"))\n", + "print(\"Here is one of our loaded model's features: \\n\\n\",loaded_model.Beta)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making a Prediction and Saving the Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time to **make a prediction!**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# read test data\n", + "test_data = pd.read_csv(\"./data/distributed_data/linear_regression_test.csv\").drop([\"target\"], axis=1)\n", + "\n", + "# now predict using the model from the training above\n", + "predict_result = d4p.linear_regression_prediction().compute(test_data, train_result.model).prediction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **export the results to a CSV file**. 
We will also **stop the distribution engine.**" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\n" + ] + } + ], + "source": [ + "# now export the results to a CSV file\n", + "results_filename = \"./results/daal4py_Distributed_LinearRegression_results\" + str(d4p.my_procid()+1) + \".csv\"\n", + "np.savetxt(results_filename, predict_result, delimiter = \",\")\n", + "\n", + "d4p.daalfini() # stops the distribution engine\n", + "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/daal4py_Distributed_LinearRegression.py b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/daal4py_Distributed_LinearRegression.py new file mode 100755 index 0000000000..820afbafa3 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/daal4py_Distributed_LinearRegression.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +''' +============================================================= +Copyright © 2020 Intel Corporation + +SPDX-License-Identifier: MIT +============================================================= +''' + +# # Daal4py Linear Regression Example for Distributed Memory Systems [SPMD mode] + +# ## IMPORTANT NOTICE +# + +# When using daal4py for distributed memory systems, the command needed to execute the program should be **executed +# in a bash shell**. 
In order to run this example, please download it as a .py file then run the following command (**the number 4 means that it will run on 4 processes**): + +# mpirun -n 4 python ./daal4py_Distributed_LinearRegression.py + +# ## Importing and Organizing Data + +# In this example we will be predicting **prices of houses in Boston** based on the features of each house. +# +# Let's start by **importing** all necessary data and packages. + +# In[2]: + + +##### daal4py linear regression example for distributed memory systems [SPMD mode] ##### +import daal4py as d4p +from sklearn.datasets import load_boston +from sklearn.model_selection import train_test_split +import pandas as pd +import numpy as np +import pickle + + +# Now let's **load** in the dataset and **organize** it as necessary to work with our model. For distributed, every file has a unique ID. +# +# We will also **initialize the distribution engine**. + +# In[3]: + + +d4p.daalinit() #initializes the distribution engine + +# organizing variables used in the model for prediction +# each process gets its own data +infile = "./data/distributed_data/linear_regression_train_" + str(d4p.my_procid()+1) + ".csv" + +# read data +indep_data = pd.read_csv(infile).drop(["target"], axis=1) # house characteristics +dep_data = pd.read_csv(infile)["target"] # house price + + +# ## Training and Saving the Model + +# Time to **train our model** and look at the model's features! 
+ +# In[4]: + + +# training the model for prediction +train_result = d4p.linear_regression_training(distributed=True).compute(indep_data, dep_data) + + +# To **get training model information** and **save it to a file**: + +# In[5]: + + +# retrieving and printing training model +model = train_result.model +print("Here's our model:\n\n\n",model , "\n") + +model_filename = './models/daal4py_Distributed_LinearRegression_' + str(d4p.my_procid()+1) + '.sav' + +# saving model to a file +pickle.dump(model, open(model_filename, "wb")) + + +# Now let's **load up the model** and look at one of the model's features. + +# In[6]: + + +# loading the training model from a file +loaded_model = pickle.load(open(model_filename, "rb")) +print("Here is one of our loaded model's features: \n\n",loaded_model.Beta) + + +# ## Making a Prediction and Saving the Results + +# Time to **make a prediction!** + +# In[9]: + + +# read test data +test_data = pd.read_csv("./data/distributed_data/linear_regression_test.csv").drop(["target"], axis=1) + +# now predict using the model from the training above +predict_result = d4p.linear_regression_prediction().compute(test_data, train_result.model).prediction + + +# Now let's **export the results to a CSV file**. 
We will also **stop the distribution engine.** + +# In[10]: + + +# now export the results to a CSV file +results_filename = "./results/daal4py_Distributed_LinearRegression_results" + str(d4p.my_procid()+1) + ".csv" +np.savetxt(results_filename, predict_result, delimiter = ",") + +d4p.daalfini() # stops the distribution engine +print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]') + diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_test.csv b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_test.csv new file mode 100755 index 0000000000..3f06e11969 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_test.csv @@ -0,0 +1,128 @@ +,0,1,2,3,4,5,6,7,8,9,10,11,12,target +357,3.8497,0.0,18.1,1.0,0.77,6.395,91.0,2.5052,24.0,666.0,20.2,391.34,13.27,21.7 +236,0.52058,0.0,6.2,1.0,0.507,6.631,76.5,4.148,8.0,307.0,17.4,388.45,9.54,25.1 +262,0.52014,20.0,3.97,0.0,0.647,8.398,91.5,2.2885,5.0,264.0,13.0,386.86,5.91,48.8 +168,2.3004,0.0,19.58,0.0,0.605,6.319,96.1,2.1,5.0,403.0,14.7,297.09,11.1,23.8 +102,0.22876,0.0,8.56,0.0,0.52,6.405,85.4,2.7147,5.0,384.0,20.9,70.8,10.63,18.6 +33,1.15172,0.0,8.14,0.0,0.538,5.701,95.0,3.7872,4.0,307.0,21.0,358.77,18.35,13.1 +475,6.39312,0.0,18.1,0.0,0.584,6.162,97.4,2.206,24.0,666.0,20.2,302.76,24.1,13.3 +376,15.288,0.0,18.1,0.0,0.671,6.649,93.3,1.3449,24.0,666.0,20.2,363.02,23.24,13.9 +285,0.01096,55.0,2.25,0.0,0.389,6.453,31.9,7.3073,1.0,300.0,15.3,394.72,8.23,22.0 +179,0.0578,0.0,2.46,0.0,0.488,6.98,58.4,2.829,3.0,193.0,17.8,396.9,5.04,37.2 +315,0.25356,0.0,9.9,0.0,0.544,5.705,77.7,3.945,4.0,304.0,18.4,396.42,11.5,16.2 +175,0.06664,0.0,4.05,0.0,0.51,6.546,33.1,3.1323,5.0,296.0,16.6,390.96,5.33,29.4 +223,0.6147,0.0,6.2,0.0,0.507,6.618,80.8,3.2721,8.0,307.0,17.4,396.9,7.6,30.1 +309,0.3494,0.0,9.9,0.0,0.544,5.972,76.7,3.1025,4.0,304.0,18.4,396.24,9.97,20.3 
+137,0.35233,0.0,21.89,0.0,0.624,6.454,98.4,1.8498,4.0,437.0,21.2,394.08,14.59,17.1 +226,0.38214,0.0,6.2,0.0,0.504,8.04,86.5,3.2157,8.0,307.0,17.4,387.38,3.13,37.6 +328,0.06617,0.0,3.24,0.0,0.46,5.868,25.8,5.2146,4.0,430.0,16.9,382.44,9.97,19.3 +416,10.8342,0.0,18.1,0.0,0.679,6.782,90.8,1.8195,24.0,666.0,20.2,21.57,25.79,7.5 +409,14.4383,0.0,18.1,0.0,0.597,6.852,100.0,1.4655,24.0,666.0,20.2,179.36,19.78,27.5 +84,0.05059,0.0,4.49,0.0,0.449,6.389,48.0,4.7794,3.0,247.0,18.5,396.9,9.62,23.9 +186,0.05602,0.0,2.46,0.0,0.488,7.831,53.6,3.1992,3.0,193.0,17.8,392.63,4.45,50.0 +133,0.32982,0.0,21.89,0.0,0.624,5.822,95.4,2.4699,4.0,437.0,21.2,388.69,15.03,18.4 +230,0.537,0.0,6.2,0.0,0.504,5.981,68.1,3.6715,8.0,307.0,17.4,378.35,11.65,24.3 +454,9.51363,0.0,18.1,0.0,0.713,6.728,94.1,2.4961,24.0,666.0,20.2,6.68,18.71,14.9 +7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1 +117,0.15098,0.0,10.01,0.0,0.547,6.021,82.6,2.7474,6.0,432.0,17.8,394.51,10.3,19.2 +214,0.28955,0.0,10.59,0.0,0.489,5.412,9.8,3.5875,4.0,277.0,18.6,348.93,29.55,23.7 +15,0.62739,0.0,8.14,0.0,0.538,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47,19.9 +136,0.32264,0.0,21.89,0.0,0.624,5.942,93.5,1.9669,4.0,437.0,21.2,378.25,16.9,17.4 +388,14.3337,0.0,18.1,0.0,0.7,4.88,100.0,1.5895,24.0,666.0,20.2,372.92,30.62,10.2 +178,0.06642,0.0,4.05,0.0,0.51,6.86,74.4,2.9153,5.0,296.0,16.6,391.27,6.92,29.9 +95,0.12204,0.0,2.89,0.0,0.445,6.625,57.8,3.4952,2.0,276.0,18.0,357.98,6.65,28.4 +495,0.17899,0.0,9.69,0.0,0.585,5.67,28.8,2.7986,6.0,391.0,19.2,393.29,17.6,23.1 +53,0.04981,21.0,5.64,0.0,0.439,5.998,21.4,6.8147,4.0,243.0,16.8,396.9,8.43,23.4 +131,1.19294,0.0,21.89,0.0,0.624,6.326,97.7,2.271,4.0,437.0,21.2,396.9,12.26,19.6 +392,11.5779,0.0,18.1,0.0,0.7,5.036,97.0,1.77,24.0,666.0,20.2,396.9,25.68,9.7 +25,0.84054,0.0,8.14,0.0,0.538,5.599,85.7,4.4546,4.0,307.0,21.0,303.42,16.51,13.9 +302,0.09266,34.0,6.09,0.0,0.433,6.495,18.4,5.4917,7.0,329.0,16.1,383.61,8.67,26.4 
+199,0.0315,95.0,1.47,0.0,0.403,6.975,15.3,7.6534,3.0,402.0,17.0,396.9,4.56,34.9 +210,0.17446,0.0,10.59,1.0,0.489,5.96,92.1,3.8771,4.0,277.0,18.6,393.25,17.27,21.7 +292,0.03615,80.0,4.95,0.0,0.411,6.63,23.4,5.1167,4.0,245.0,19.2,396.9,4.7,27.9 +290,0.03502,80.0,4.95,0.0,0.411,6.861,27.9,5.1167,4.0,245.0,19.2,396.9,3.33,28.5 +128,0.32543,0.0,21.89,0.0,0.624,6.431,98.8,1.8125,4.0,437.0,21.2,396.9,15.39,18.0 +489,0.18337,0.0,27.74,0.0,0.609,5.414,98.3,1.7554,4.0,711.0,20.1,344.05,23.97,7.0 +367,13.5222,0.0,18.1,0.0,0.631,3.863,100.0,1.5106,24.0,666.0,20.2,131.42,13.33,23.1 +122,0.09299,0.0,25.65,0.0,0.581,5.961,92.9,2.0869,2.0,188.0,19.1,378.09,17.93,20.5 +171,2.3139,0.0,19.58,0.0,0.605,5.88,97.3,2.3887,5.0,403.0,14.7,348.13,12.03,19.1 +405,67.9208,0.0,18.1,0.0,0.693,5.683,100.0,1.4254,24.0,666.0,20.2,384.97,22.98,5.0 +225,0.52693,0.0,6.2,0.0,0.504,8.725,83.0,2.8944,8.0,307.0,17.4,382.0,4.63,50.0 +228,0.29819,0.0,6.2,0.0,0.504,7.686,17.0,3.3751,8.0,307.0,17.4,377.51,3.92,46.7 +162,1.83377,0.0,19.58,1.0,0.605,7.802,98.2,2.0407,5.0,403.0,14.7,389.61,1.92,50.0 +394,13.3598,0.0,18.1,0.0,0.693,5.887,94.7,1.7821,24.0,666.0,20.2,396.9,16.35,12.7 +461,3.69311,0.0,18.1,0.0,0.713,6.376,88.4,2.5671,24.0,666.0,20.2,391.43,14.65,17.7 +242,0.1029,30.0,4.93,0.0,0.428,6.358,52.9,7.0355,6.0,300.0,16.6,372.75,11.22,22.2 +55,0.01311,90.0,1.22,0.0,0.403,7.249,21.9,8.6966,5.0,226.0,17.9,395.93,4.81,35.4 +459,6.80117,0.0,18.1,0.0,0.713,6.081,84.4,2.7175,24.0,666.0,20.2,396.9,14.7,20.0 +286,0.01965,80.0,1.76,0.0,0.385,6.23,31.5,9.0892,1.0,241.0,18.2,341.6,12.93,20.1 +1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6 +440,22.0511,0.0,18.1,0.0,0.74,5.818,92.4,1.8662,24.0,666.0,20.2,391.45,22.11,10.5 +456,4.66883,0.0,18.1,0.0,0.713,5.976,87.9,2.5806,24.0,666.0,20.2,10.48,19.01,12.7 +157,1.22358,0.0,19.58,0.0,0.605,6.943,97.4,1.8773,5.0,403.0,14.7,363.43,4.59,41.3 +60,0.14932,25.0,5.13,0.0,0.453,5.741,66.2,7.2254,8.0,284.0,19.7,395.11,13.15,18.7 
+111,0.10084,0.0,10.01,0.0,0.547,6.715,81.6,2.6775,6.0,432.0,17.8,395.59,10.16,22.8 +24,0.75026,0.0,8.14,0.0,0.538,5.924,94.1,4.3996,4.0,307.0,21.0,394.33,16.3,15.6 +20,1.25179,0.0,8.14,0.0,0.538,5.57,98.1,3.7979,4.0,307.0,21.0,376.57,21.02,13.6 +32,1.38799,0.0,8.14,0.0,0.538,5.95,82.0,3.99,4.0,307.0,21.0,232.6,27.71,13.2 +276,0.10469,40.0,6.41,1.0,0.447,7.267,49.0,4.7872,4.0,254.0,17.6,389.25,6.05,33.2 +191,0.06911,45.0,3.44,0.0,0.437,6.739,30.8,6.4798,5.0,398.0,15.2,389.71,4.69,30.5 +480,5.82401,0.0,18.1,0.0,0.532,6.242,64.7,3.4242,24.0,666.0,20.2,396.9,10.74,23.0 +485,3.67367,0.0,18.1,0.0,0.583,6.312,51.9,3.9917,24.0,666.0,20.2,388.62,10.58,21.2 +141,1.62864,0.0,21.89,0.0,0.624,5.019,100.0,1.4394,4.0,437.0,21.2,396.9,34.41,14.4 +493,0.17331,0.0,9.69,0.0,0.585,5.707,54.0,2.3817,6.0,391.0,19.2,396.9,12.01,21.8 +70,0.08826,0.0,10.81,0.0,0.413,6.417,6.6,5.2873,4.0,305.0,19.2,383.73,6.72,24.2 +269,0.09065,20.0,6.96,1.0,0.464,5.92,61.5,3.9175,3.0,223.0,18.6,391.34,13.65,20.7 +65,0.03584,80.0,3.37,0.0,0.398,6.29,17.8,6.6115,4.0,337.0,16.1,396.9,4.67,23.5 +195,0.01381,80.0,0.46,0.0,0.422,7.875,32.0,5.6484,4.0,255.0,14.4,394.23,2.97,50.0 +3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4 +433,5.58107,0.0,18.1,0.0,0.713,6.436,87.9,2.3158,24.0,666.0,20.2,100.19,16.22,14.3 +431,10.0623,0.0,18.1,0.0,0.584,6.833,94.3,2.0882,24.0,666.0,20.2,81.33,19.69,14.1 +87,0.07151,0.0,4.49,0.0,0.449,6.121,56.8,3.7476,3.0,247.0,18.5,395.15,8.44,22.2 +62,0.11027,25.0,5.13,0.0,0.453,6.456,67.8,7.2255,8.0,284.0,19.7,396.9,6.73,22.2 +108,0.12802,0.0,8.56,0.0,0.52,6.474,97.1,2.4329,5.0,384.0,20.9,395.24,12.27,19.8 +393,8.64476,0.0,18.1,0.0,0.693,6.193,92.6,1.7912,24.0,666.0,20.2,396.9,15.17,13.8 +398,38.3518,0.0,18.1,0.0,0.693,5.453,100.0,1.4896,24.0,666.0,20.2,396.9,30.59,5.0 +132,0.59005,0.0,21.89,0.0,0.624,6.372,97.9,2.3274,4.0,437.0,21.2,385.76,11.12,23.0 +241,0.10612,30.0,4.93,0.0,0.428,6.095,65.1,6.3361,6.0,300.0,16.6,394.62,12.4,20.1 
+14,0.63796,0.0,8.14,0.0,0.538,6.096,84.5,4.4619,4.0,307.0,21.0,380.02,10.26,18.2 +83,0.03551,25.0,4.86,0.0,0.426,6.167,46.7,5.4007,4.0,281.0,19.0,390.64,7.51,22.9 +284,0.00906,90.0,2.97,0.0,0.4,7.088,20.8,7.3073,1.0,285.0,15.3,394.72,7.85,32.2 +353,0.01709,90.0,2.02,0.0,0.41,6.728,36.1,12.1265,5.0,187.0,17.0,384.46,4.5,30.1 +90,0.04684,0.0,3.41,0.0,0.489,6.417,66.1,3.0923,2.0,270.0,17.8,392.18,8.81,22.6 +414,45.7461,0.0,18.1,0.0,0.693,4.519,100.0,1.6582,24.0,666.0,20.2,88.27,36.98,7.0 +257,0.61154,20.0,3.97,0.0,0.647,8.704,86.9,1.801,5.0,264.0,13.0,389.7,5.12,50.0 +313,0.26938,0.0,9.9,0.0,0.544,6.266,82.8,3.2628,4.0,304.0,18.4,393.39,7.9,21.6 +438,13.6781,0.0,18.1,0.0,0.74,5.935,87.9,1.8206,24.0,666.0,20.2,68.95,34.02,8.4 +159,1.42502,0.0,19.58,0.0,0.871,6.51,100.0,1.7659,5.0,403.0,14.7,364.31,7.39,23.3 +36,0.09744,0.0,5.96,0.0,0.499,5.841,61.4,3.3779,5.0,279.0,19.2,377.56,11.41,20.0 +283,0.01501,90.0,1.21,1.0,0.401,7.923,24.8,5.885,1.0,198.0,13.6,395.52,3.16,50.0 +126,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26,15.7 +369,5.66998,0.0,18.1,1.0,0.631,6.683,96.8,1.3567,24.0,666.0,20.2,375.33,3.73,50.0 +264,0.55007,20.0,3.97,0.0,0.647,7.206,91.6,1.9301,5.0,264.0,13.0,387.89,8.1,36.5 +325,0.19186,0.0,7.38,0.0,0.493,6.431,14.7,5.4159,5.0,287.0,19.6,393.68,5.08,24.6 +399,9.91655,0.0,18.1,0.0,0.693,5.852,77.8,1.5004,24.0,666.0,20.2,338.16,29.97,6.3 +449,7.52601,0.0,18.1,0.0,0.713,6.417,98.3,2.185,24.0,666.0,20.2,304.21,19.31,13.0 +436,14.4208,0.0,18.1,0.0,0.74,6.461,93.3,2.0026,24.0,666.0,20.2,27.49,18.05,9.6 +80,0.04113,25.0,4.86,0.0,0.426,6.727,33.5,5.4007,4.0,281.0,19.0,396.9,5.29,28.0 +220,0.35809,0.0,6.2,1.0,0.507,6.951,88.5,2.8617,8.0,307.0,17.4,391.7,9.71,26.7 +93,0.02875,28.0,15.04,0.0,0.464,6.211,28.9,3.6659,4.0,270.0,18.2,396.33,6.21,25.0 +363,4.22239,0.0,18.1,1.0,0.77,5.803,89.0,1.9047,24.0,666.0,20.2,353.04,14.64,16.8 +258,0.66351,20.0,3.97,0.0,0.647,7.333,100.0,1.8946,5.0,264.0,13.0,383.29,7.79,36.0 
+198,0.03768,80.0,1.52,0.0,0.404,7.274,38.3,7.309,2.0,329.0,12.6,392.2,6.62,34.6 +460,4.81213,0.0,18.1,0.0,0.713,6.701,90.0,2.5975,24.0,666.0,20.2,255.23,16.42,16.4 +107,0.13117,0.0,8.56,0.0,0.52,6.127,85.2,2.1224,5.0,384.0,20.9,387.69,14.09,20.4 +125,0.16902,0.0,25.65,0.0,0.581,5.986,88.4,1.9929,2.0,188.0,19.1,385.02,14.81,21.4 +113,0.22212,0.0,10.01,0.0,0.547,6.092,95.4,2.548,6.0,432.0,17.8,396.9,17.09,18.7 +218,0.11069,0.0,13.89,1.0,0.55,5.951,93.8,2.8893,5.0,276.0,16.4,396.9,17.92,21.5 +474,8.05579,0.0,18.1,0.0,0.584,5.427,95.4,2.4298,24.0,666.0,20.2,352.58,18.14,13.8 +247,0.19657,22.0,5.86,0.0,0.431,6.226,79.2,8.0555,7.0,330.0,19.1,376.14,10.15,20.5 +346,0.06162,0.0,4.39,0.0,0.442,5.898,52.3,8.0136,3.0,352.0,18.8,364.61,12.67,17.2 +288,0.0459,52.5,5.32,0.0,0.405,6.315,45.6,7.3172,6.0,293.0,16.6,396.9,7.6,22.3 +115,0.17134,0.0,10.01,0.0,0.547,5.928,88.2,2.4631,6.0,432.0,17.8,344.91,15.76,18.3 +486,5.69175,0.0,18.1,0.0,0.583,6.114,79.8,3.5459,24.0,666.0,20.2,392.68,14.98,19.1 +42,0.1415,0.0,6.91,0.0,0.448,6.169,6.6,5.7209,3.0,233.0,17.9,383.37,5.81,25.3 +481,5.70818,0.0,18.1,0.0,0.532,6.75,74.9,3.3317,24.0,666.0,20.2,393.07,7.74,23.7 +469,13.0751,0.0,18.1,0.0,0.58,5.713,56.7,2.8237,24.0,666.0,20.2,396.9,14.76,20.1 +246,0.33983,22.0,5.86,0.0,0.431,6.108,34.9,8.0555,7.0,330.0,19.1,390.18,9.16,24.3 +250,0.1403,22.0,5.86,0.0,0.431,6.487,13.0,7.3967,7.0,330.0,19.1,396.28,5.9,24.4 diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_1.csv b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_1.csv new file mode 100755 index 0000000000..9bf366e4c2 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_1.csv @@ -0,0 +1,96 @@ +,0,1,2,3,4,5,6,7,8,9,10,11,12,target +244,0.20608,22.0,5.86,0.0,0.431,5.593,76.5,7.9549,7.0,330.0,19.1,372.49,12.5,17.6 
+94,0.04294,28.0,15.04,0.0,0.464,6.249,77.3,3.615,4.0,270.0,18.2,396.9,10.59,20.6 +291,0.07886,80.0,4.95,0.0,0.411,7.148,27.7,5.1167,4.0,245.0,19.2,396.9,3.56,37.3 +446,6.28807,0.0,18.1,0.0,0.74,6.341,96.4,2.072,24.0,666.0,20.2,318.01,17.79,14.9 +373,11.1081,0.0,18.1,0.0,0.668,4.906,100.0,1.1742,24.0,666.0,20.2,396.9,34.77,13.8 +358,5.20177,0.0,18.1,1.0,0.77,6.127,83.4,2.7227,24.0,666.0,20.2,395.43,11.48,22.7 +327,0.24103,0.0,7.38,0.0,0.493,6.083,43.7,5.4159,5.0,287.0,19.6,396.9,12.79,22.2 +8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5 +74,0.07896,0.0,12.83,0.0,0.437,6.273,6.0,4.2515,5.0,398.0,18.7,394.92,6.78,24.1 +184,0.08308,0.0,2.46,0.0,0.488,5.604,89.8,2.9879,3.0,193.0,17.8,391.0,13.98,26.4 +149,2.73397,0.0,19.58,0.0,0.871,5.597,94.9,1.5257,5.0,403.0,14.7,351.85,21.45,15.4 +49,0.21977,0.0,6.91,0.0,0.448,5.602,62.0,6.0877,3.0,233.0,17.9,396.9,16.2,19.4 +402,9.59571,0.0,18.1,0.0,0.693,6.404,100.0,1.639,24.0,666.0,20.2,376.11,20.31,12.1 +11,0.11747,12.5,7.87,0.0,0.524,6.009,82.9,6.2267,5.0,311.0,15.2,396.9,13.27,18.9 +145,2.37934,0.0,19.58,0.0,0.871,6.13,100.0,1.4191,5.0,403.0,14.7,172.91,27.8,13.8 +501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4 +41,0.12744,0.0,6.91,0.0,0.448,6.77,2.9,5.7209,3.0,233.0,17.9,385.41,4.84,26.6 +385,16.8118,0.0,18.1,0.0,0.7,5.277,98.1,1.4261,24.0,666.0,20.2,396.9,30.81,7.2 +97,0.12083,0.0,2.89,0.0,0.445,8.069,76.0,3.4952,2.0,276.0,18.0,396.9,4.21,38.7 +161,1.46336,0.0,19.58,0.0,0.605,7.489,90.8,1.9709,5.0,403.0,14.7,374.43,1.73,50.0 +435,11.1604,0.0,18.1,0.0,0.74,6.629,94.6,2.1247,24.0,666.0,20.2,109.85,23.27,13.4 +349,0.02899,40.0,1.25,0.0,0.429,6.939,34.5,8.7921,1.0,335.0,19.7,389.85,5.89,26.6 +217,0.07013,0.0,13.89,0.0,0.55,6.642,85.1,3.4211,5.0,276.0,16.4,392.78,9.69,28.7 +160,1.27346,0.0,19.58,1.0,0.605,6.25,92.6,1.7984,5.0,403.0,14.7,338.92,5.5,27.0 +375,19.6091,0.0,18.1,0.0,0.671,7.313,97.9,1.3163,24.0,666.0,20.2,396.9,13.44,15.0 
+304,0.05515,33.0,2.18,0.0,0.472,7.236,41.1,4.022,7.0,222.0,18.4,393.68,6.93,36.1 +51,0.04337,21.0,5.64,0.0,0.439,6.115,63.0,6.8147,4.0,243.0,16.8,393.97,9.43,20.5 +338,0.03306,0.0,5.19,0.0,0.515,6.059,37.3,4.8122,5.0,224.0,20.2,396.14,8.51,20.6 +266,0.7857,20.0,3.97,0.0,0.647,7.014,84.6,2.1329,5.0,264.0,13.0,384.07,14.79,30.7 +305,0.05479,33.0,2.18,0.0,0.472,6.616,58.1,3.37,7.0,222.0,18.4,393.36,8.93,28.4 +183,0.10008,0.0,2.46,0.0,0.488,6.563,95.6,2.847,3.0,193.0,17.8,396.9,5.68,32.5 +344,0.03049,55.0,3.78,0.0,0.484,6.874,28.1,6.4654,5.0,370.0,17.6,387.97,4.61,31.2 +484,2.37857,0.0,18.1,0.0,0.583,5.871,41.9,3.724,24.0,666.0,20.2,370.73,13.34,20.6 +401,14.2362,0.0,18.1,0.0,0.693,6.343,100.0,1.5741,24.0,666.0,20.2,396.9,20.32,7.2 +89,0.05302,0.0,3.41,0.0,0.489,7.079,63.1,3.4145,2.0,270.0,17.8,396.06,5.7,28.7 +467,4.42228,0.0,18.1,0.0,0.584,6.003,94.5,2.5403,24.0,666.0,20.2,331.29,21.32,19.1 +490,0.20746,0.0,27.74,0.0,0.609,5.093,98.0,1.8226,4.0,711.0,20.1,318.43,29.68,8.1 +410,51.1358,0.0,18.1,0.0,0.597,5.757,100.0,1.413,24.0,666.0,20.2,2.6,10.11,15.0 +61,0.17171,25.0,5.13,0.0,0.453,5.966,93.4,6.8185,8.0,284.0,19.7,378.08,14.44,16.0 +194,0.01439,60.0,2.93,0.0,0.401,6.604,18.8,6.2196,1.0,265.0,15.6,376.7,4.38,29.1 +448,9.32909,0.0,18.1,0.0,0.713,6.185,98.7,2.2616,24.0,666.0,20.2,396.9,18.13,14.1 +234,0.44791,0.0,6.2,1.0,0.507,6.726,66.5,3.6519,8.0,307.0,17.4,360.2,8.05,29.0 +129,0.88125,0.0,21.89,0.0,0.624,5.637,94.7,1.9799,4.0,437.0,21.2,396.9,18.34,14.3 +294,0.08199,0.0,13.92,0.0,0.437,6.009,42.3,5.5027,4.0,289.0,16.0,396.9,10.4,21.7 +239,0.09252,30.0,4.93,0.0,0.428,6.606,42.2,6.1899,6.0,300.0,16.6,383.78,7.37,23.3 +386,24.3938,0.0,18.1,0.0,0.7,4.652,100.0,1.4672,24.0,666.0,20.2,396.9,28.28,10.5 +289,0.04297,52.5,5.32,0.0,0.405,6.565,22.9,7.3172,6.0,293.0,16.6,371.72,9.51,24.8 +427,37.6619,0.0,18.1,0.0,0.679,6.202,78.7,1.8629,24.0,666.0,20.2,18.82,14.52,10.9 +323,0.28392,0.0,7.38,0.0,0.493,5.708,74.3,4.7211,5.0,287.0,19.6,391.13,11.74,18.5 
+343,0.02543,55.0,3.78,0.0,0.484,6.696,56.4,5.7321,5.0,370.0,17.6,396.9,7.18,23.9 +432,6.44405,0.0,18.1,0.0,0.584,6.425,74.8,2.2004,24.0,666.0,20.2,97.95,12.03,16.1 +356,8.98296,0.0,18.1,1.0,0.77,6.212,97.4,2.1222,24.0,666.0,20.2,377.73,17.6,17.8 +64,0.01951,17.5,1.38,0.0,0.4161,7.104,59.5,9.2229,3.0,216.0,18.6,393.24,8.05,33.0 +335,0.03961,0.0,5.19,0.0,0.515,6.037,34.5,5.9853,5.0,224.0,20.2,396.9,8.01,21.1 +18,0.80271,0.0,8.14,0.0,0.538,5.456,36.6,3.7965,4.0,307.0,21.0,288.99,11.69,20.2 +109,0.26363,0.0,8.56,0.0,0.52,6.229,91.2,2.5451,5.0,384.0,20.9,391.23,15.55,19.4 +397,7.67202,0.0,18.1,0.0,0.693,5.747,98.9,1.6334,24.0,666.0,20.2,393.1,19.92,8.5 +75,0.09512,0.0,12.83,0.0,0.437,6.286,45.0,4.5026,5.0,398.0,18.7,383.23,8.94,21.4 +130,0.34006,0.0,21.89,0.0,0.624,6.458,98.9,2.1185,4.0,437.0,21.2,395.04,12.6,19.2 +63,0.1265,25.0,5.13,0.0,0.453,6.762,43.4,7.9809,8.0,284.0,19.7,395.58,9.5,25.0 +27,0.95577,0.0,8.14,0.0,0.538,6.047,88.8,4.4534,4.0,307.0,21.0,306.38,17.28,14.8 +30,1.13081,0.0,8.14,0.0,0.538,5.713,94.1,4.233,4.0,307.0,21.0,360.17,22.6,12.7 +238,0.08244,30.0,4.93,0.0,0.428,6.481,18.5,6.1899,6.0,300.0,16.6,379.41,6.36,23.7 +470,4.34879,0.0,18.1,0.0,0.58,6.167,84.0,3.0334,24.0,666.0,20.2,396.9,16.29,19.9 +471,4.03841,0.0,18.1,0.0,0.532,6.229,90.7,3.0993,24.0,666.0,20.2,395.33,12.87,19.6 +45,0.17142,0.0,6.91,0.0,0.448,5.682,33.8,5.1004,3.0,233.0,17.9,396.9,10.21,19.3 +224,0.31533,0.0,6.2,0.0,0.504,8.266,78.3,2.8944,8.0,307.0,17.4,385.05,4.14,44.8 +296,0.05372,0.0,13.92,0.0,0.437,6.549,51.0,5.9604,4.0,289.0,16.0,392.85,7.39,27.1 +185,0.06047,0.0,2.46,0.0,0.488,6.153,68.8,3.2797,3.0,193.0,17.8,387.11,13.15,29.6 +457,8.20058,0.0,18.1,0.0,0.713,5.936,80.3,2.7792,24.0,666.0,20.2,3.5,16.94,13.5 +249,0.19073,22.0,5.86,0.0,0.431,6.718,17.5,7.8265,7.0,330.0,19.1,393.74,6.56,26.2 +124,0.09849,0.0,25.65,0.0,0.581,5.879,95.8,2.0063,2.0,188.0,19.1,379.38,17.58,18.8 +103,0.21161,0.0,8.56,0.0,0.52,6.137,87.4,2.7147,5.0,384.0,20.9,394.47,13.44,19.3 
+67,0.05789,12.5,6.07,0.0,0.409,5.878,21.4,6.498,4.0,345.0,18.9,396.21,8.1,22.0 +59,0.10328,25.0,5.13,0.0,0.453,5.927,47.2,6.932,8.0,284.0,19.7,396.9,9.22,19.6 +497,0.26838,0.0,9.69,0.0,0.585,5.794,70.6,2.8927,6.0,391.0,19.2,396.9,14.1,18.3 +378,23.6482,0.0,18.1,0.0,0.671,6.38,96.2,1.3861,24.0,666.0,20.2,396.9,23.69,13.1 +468,15.5757,0.0,18.1,0.0,0.58,5.926,71.0,2.9084,24.0,666.0,20.2,368.74,18.13,19.1 +372,8.26725,0.0,18.1,1.0,0.668,5.875,89.6,1.1296,24.0,666.0,20.2,347.88,8.88,50.0 +418,73.5341,0.0,18.1,0.0,0.679,5.957,100.0,1.8026,24.0,666.0,20.2,16.45,20.62,8.8 +212,0.21719,0.0,10.59,1.0,0.489,5.807,53.8,3.6526,4.0,277.0,18.6,390.94,16.03,22.4 +253,0.36894,22.0,5.86,0.0,0.431,8.259,8.4,8.9067,7.0,330.0,19.1,396.9,3.54,42.8 +261,0.53412,20.0,3.97,0.0,0.647,7.52,89.4,2.1398,5.0,264.0,13.0,388.37,7.26,43.1 +447,9.92485,0.0,18.1,0.0,0.74,6.251,96.6,2.198,24.0,666.0,20.2,388.52,16.44,12.6 +502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6 +204,0.02009,95.0,2.68,0.0,0.4161,8.034,31.9,5.118,4.0,224.0,14.7,390.55,2.88,50.0 +10,0.22489,12.5,7.87,0.0,0.524,6.377,94.3,6.3467,5.0,311.0,15.2,392.52,20.45,15.0 +98,0.08187,0.0,2.89,0.0,0.445,7.82,36.9,3.4952,2.0,276.0,18.0,393.53,3.57,43.8 +34,1.61282,0.0,8.14,0.0,0.538,6.096,96.9,3.7598,4.0,307.0,21.0,248.31,20.34,13.5 +422,12.0482,0.0,18.1,0.0,0.614,5.648,87.6,1.9512,24.0,666.0,20.2,291.55,14.1,20.8 +92,0.04203,28.0,15.04,0.0,0.464,6.442,53.6,3.6659,4.0,270.0,18.2,395.01,8.16,22.9 +221,0.40771,0.0,6.2,1.0,0.507,6.164,91.3,3.048,8.0,307.0,17.4,395.24,21.46,21.7 +366,3.69695,0.0,18.1,0.0,0.718,4.963,91.4,1.7523,24.0,666.0,20.2,316.03,14.0,21.9 +270,0.29916,20.0,6.96,0.0,0.464,5.856,42.1,4.429,3.0,223.0,18.6,388.65,13.0,21.1 +82,0.03659,25.0,4.86,0.0,0.426,6.302,32.2,5.4007,4.0,281.0,19.0,396.9,6.72,24.8 diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_2.csv 
b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_2.csv new file mode 100755 index 0000000000..c4f0195720 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_2.csv @@ -0,0 +1,96 @@ +,0,1,2,3,4,5,6,7,8,9,10,11,12,target +153,2.14918,0.0,19.58,0.0,0.871,5.709,98.5,1.6232,5.0,403.0,14.7,261.95,15.79,19.4 +453,8.24809,0.0,18.1,0.0,0.713,7.393,99.3,2.4527,24.0,666.0,20.2,375.87,16.74,17.8 +6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9 +19,0.7258,0.0,8.14,0.0,0.538,5.727,69.5,3.7965,4.0,307.0,21.0,390.95,11.28,18.2 +492,0.11132,0.0,27.74,0.0,0.609,5.983,83.5,2.1099,4.0,711.0,20.1,396.9,13.35,20.1 +229,0.44178,0.0,6.2,0.0,0.504,6.552,21.4,3.3751,8.0,307.0,17.4,380.34,3.76,31.5 +201,0.03445,82.5,2.03,0.0,0.415,6.162,38.4,6.27,2.0,348.0,14.7,393.77,7.43,24.1 +193,0.02187,60.0,2.93,0.0,0.401,6.8,9.9,6.2196,1.0,265.0,15.6,393.37,5.03,31.1 +79,0.08387,0.0,12.83,0.0,0.437,5.874,36.6,4.5026,5.0,398.0,18.7,396.06,9.1,20.3 +479,14.3337,0.0,18.1,0.0,0.614,6.229,88.0,1.9512,24.0,666.0,20.2,383.32,13.11,21.4 +299,0.05561,70.0,2.24,0.0,0.4,7.041,10.0,7.8278,5.0,358.0,14.8,371.58,4.74,29.0 +403,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77,8.3 +482,5.73116,0.0,18.1,0.0,0.532,7.061,77.0,3.4106,24.0,666.0,20.2,395.28,7.01,25.0 +39,0.02763,75.0,2.95,0.0,0.428,6.595,21.8,5.4011,3.0,252.0,18.3,395.63,4.32,30.8 +169,2.44953,0.0,19.58,0.0,0.605,6.402,95.2,2.2625,5.0,403.0,14.7,330.04,11.32,22.3 +172,0.13914,0.0,4.05,0.0,0.51,5.572,88.5,2.5961,5.0,296.0,16.6,396.9,14.69,23.1 +213,0.14052,0.0,10.59,0.0,0.489,6.375,32.3,3.9454,4.0,277.0,18.6,385.81,9.38,28.1 +442,5.66637,0.0,18.1,0.0,0.74,6.219,100.0,2.0048,24.0,666.0,20.2,395.69,16.59,18.4 +395,8.71675,0.0,18.1,0.0,0.693,6.471,98.8,1.7257,24.0,666.0,20.2,391.98,17.12,13.1 +345,0.03113,0.0,4.39,0.0,0.442,6.014,48.5,8.0136,3.0,352.0,18.8,385.64,10.53,17.5 
+321,0.18159,0.0,7.38,0.0,0.493,6.376,54.3,4.5404,5.0,287.0,19.6,396.9,6.87,23.1 +211,0.37578,0.0,10.59,1.0,0.489,5.404,88.6,3.665,4.0,277.0,18.6,395.24,23.98,19.3 +348,0.01501,80.0,2.01,0.0,0.435,6.635,29.7,8.344,4.0,280.0,17.0,390.94,5.99,24.5 +473,4.64689,0.0,18.1,0.0,0.614,6.98,67.6,2.5329,24.0,666.0,20.2,374.68,11.66,29.8 +219,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5,23.0 +37,0.08014,0.0,5.96,0.0,0.499,5.85,41.5,3.9342,5.0,279.0,19.2,396.9,8.77,21.0 +341,0.01301,35.0,1.52,0.0,0.442,7.241,49.3,7.0379,1.0,284.0,15.5,394.74,5.49,32.7 +391,5.29305,0.0,18.1,0.0,0.7,6.051,82.5,2.1678,24.0,666.0,20.2,378.38,18.76,23.2 +96,0.11504,0.0,2.89,0.0,0.445,6.163,69.6,3.4952,2.0,276.0,18.0,391.83,11.34,21.4 +240,0.11329,30.0,4.93,0.0,0.428,6.897,54.3,6.3361,6.0,300.0,16.6,391.25,11.38,22.0 +118,0.13058,0.0,10.01,0.0,0.547,5.872,73.1,2.4775,6.0,432.0,17.8,338.63,15.37,20.4 +355,0.10659,80.0,1.91,0.0,0.413,5.936,19.5,10.5857,4.0,334.0,22.0,376.04,5.57,20.6 +406,20.7162,0.0,18.1,0.0,0.659,4.138,100.0,1.1781,24.0,666.0,20.2,370.22,23.34,11.9 +180,0.06588,0.0,2.46,0.0,0.488,7.765,83.3,2.741,3.0,193.0,17.8,395.56,7.56,39.8 +114,0.14231,0.0,10.01,0.0,0.547,6.254,84.2,2.2565,6.0,432.0,17.8,388.74,10.45,18.5 +400,25.0461,0.0,18.1,0.0,0.693,5.987,100.0,1.5888,24.0,666.0,20.2,396.9,26.77,5.6 +135,0.55778,0.0,21.89,0.0,0.624,6.335,98.2,2.1107,4.0,437.0,21.2,394.67,16.96,18.1 +99,0.0686,0.0,2.89,0.0,0.445,7.416,62.5,3.4952,2.0,276.0,18.0,396.9,6.19,33.2 +319,0.47547,0.0,9.9,0.0,0.544,6.113,58.8,4.0019,4.0,304.0,18.4,396.23,12.73,21.0 +148,2.33099,0.0,19.58,0.0,0.871,5.186,93.8,1.5296,5.0,403.0,14.7,356.99,28.32,17.8 +504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0 +429,9.33889,0.0,18.1,0.0,0.679,6.38,95.6,1.9682,24.0,666.0,20.2,60.72,24.08,9.5 +254,0.04819,80.0,3.64,0.0,0.392,6.108,32.0,9.2203,1.0,315.0,16.4,392.89,6.57,21.9 +362,3.67822,0.0,18.1,0.0,0.77,5.362,96.2,2.1036,24.0,666.0,20.2,380.79,10.19,20.8 
+187,0.07875,45.0,3.44,0.0,0.437,6.782,41.1,3.7886,5.0,398.0,15.2,393.87,6.68,32.0 +274,0.05644,40.0,6.41,1.0,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.9,3.53,32.4 +121,0.07165,0.0,25.65,0.0,0.581,6.004,84.1,2.1974,2.0,188.0,19.1,377.67,14.27,20.3 +155,3.53501,0.0,19.58,1.0,0.871,6.152,82.6,1.7455,5.0,403.0,14.7,88.01,15.02,15.6 +77,0.08707,0.0,12.83,0.0,0.437,6.14,45.8,4.0905,5.0,398.0,18.7,386.96,10.27,20.8 +44,0.12269,0.0,6.91,0.0,0.448,6.069,40.0,5.7209,3.0,233.0,17.9,389.39,9.55,21.2 +158,1.34284,0.0,19.58,0.0,0.605,6.066,100.0,1.7573,5.0,403.0,14.7,353.89,6.43,24.3 +487,4.83567,0.0,18.1,0.0,0.583,5.905,53.2,3.1523,24.0,666.0,20.2,388.22,11.45,20.6 +189,0.0837,45.0,3.44,0.0,0.437,7.185,38.9,4.5667,5.0,398.0,15.2,396.9,5.39,34.9 +206,0.22969,0.0,10.59,0.0,0.489,6.326,52.5,4.3549,4.0,277.0,18.6,394.87,10.97,24.4 +472,3.56868,0.0,18.1,0.0,0.58,6.437,75.0,2.8965,24.0,666.0,20.2,393.37,14.36,23.2 +43,0.15936,0.0,6.91,0.0,0.448,6.211,6.5,5.7209,3.0,233.0,17.9,394.46,7.44,24.7 +156,2.44668,0.0,19.58,0.0,0.871,5.272,94.0,1.7364,5.0,403.0,14.7,88.63,16.14,13.1 +176,0.07022,0.0,4.05,0.0,0.51,6.02,47.2,3.5549,5.0,296.0,16.6,393.23,10.11,23.2 +142,3.32105,0.0,19.58,1.0,0.871,5.403,100.0,1.3216,5.0,403.0,14.7,396.9,26.82,13.4 +123,0.15038,0.0,25.65,0.0,0.581,5.856,97.0,1.9444,2.0,188.0,19.1,370.31,25.41,17.3 +164,2.24236,0.0,19.58,0.0,0.605,5.854,91.8,2.422,5.0,403.0,14.7,395.11,11.64,22.7 +100,0.14866,0.0,8.56,0.0,0.52,6.727,79.9,2.7778,5.0,384.0,20.9,394.76,9.42,27.5 +424,8.79212,0.0,18.1,0.0,0.584,5.565,70.6,2.0635,24.0,666.0,20.2,3.65,17.16,11.7 +336,0.03427,0.0,5.19,0.0,0.515,5.869,46.3,5.2311,5.0,224.0,20.2,396.9,9.8,19.5 +243,0.12757,30.0,4.93,0.0,0.428,6.393,7.8,7.0355,6.0,300.0,16.6,374.71,5.19,23.7 +421,7.02259,0.0,18.1,0.0,0.718,6.006,95.3,1.8746,24.0,666.0,20.2,319.98,15.7,14.2 +317,0.24522,0.0,9.9,0.0,0.544,5.782,71.7,4.0317,4.0,304.0,18.4,396.9,15.94,19.8 +331,0.05023,35.0,6.06,0.0,0.4379,5.706,28.4,6.6407,1.0,304.0,16.9,394.02,12.43,17.1 
+505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,11.9 +68,0.13554,12.5,6.07,0.0,0.409,5.594,36.8,6.498,4.0,345.0,18.9,396.9,13.09,17.4 +166,2.01019,0.0,19.58,0.0,0.605,7.929,96.2,2.0459,5.0,403.0,14.7,369.3,3.7,50.0 +259,0.65665,20.0,3.97,0.0,0.647,6.842,100.0,2.0107,5.0,264.0,13.0,391.93,6.9,30.1 +78,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34,21.2 +322,0.35114,0.0,7.38,0.0,0.493,6.041,49.9,4.7211,5.0,287.0,19.6,396.9,7.7,20.4 +273,0.22188,20.0,6.96,1.0,0.464,7.691,51.8,4.3665,3.0,223.0,18.6,390.77,6.58,35.2 +287,0.03871,52.5,5.32,0.0,0.405,6.209,31.3,7.3172,6.0,293.0,16.6,396.9,7.14,23.2 +278,0.07978,40.0,6.41,0.0,0.447,6.482,32.1,4.1403,4.0,254.0,17.6,396.9,7.19,29.1 +85,0.05735,0.0,4.49,0.0,0.449,6.63,56.1,4.4377,3.0,247.0,18.5,392.3,6.53,26.6 +188,0.12579,45.0,3.44,0.0,0.437,6.556,29.1,4.5667,5.0,398.0,15.2,382.84,4.56,29.8 +295,0.12932,0.0,13.92,0.0,0.437,6.678,31.1,5.9604,4.0,289.0,16.0,396.9,6.27,28.6 +359,4.26131,0.0,18.1,0.0,0.77,6.112,81.3,2.5091,24.0,666.0,20.2,390.74,12.67,22.6 +494,0.27957,0.0,9.69,0.0,0.585,5.926,42.6,2.3817,6.0,391.0,19.2,396.9,13.59,24.5 +463,5.82115,0.0,18.1,0.0,0.713,6.513,89.9,2.8016,24.0,666.0,20.2,393.82,10.29,20.2 +277,0.06127,40.0,6.41,1.0,0.447,6.826,27.6,4.8628,4.0,254.0,17.6,393.45,4.16,33.1 +143,4.0974,0.0,19.58,0.0,0.871,5.468,100.0,1.4118,5.0,403.0,14.7,396.9,26.42,15.6 +364,3.47428,0.0,18.1,1.0,0.718,8.78,82.9,1.9047,24.0,666.0,20.2,354.55,5.29,21.9 +466,3.77498,0.0,18.1,0.0,0.655,5.952,84.7,2.8715,24.0,666.0,20.2,22.01,17.15,19.0 +280,0.03578,20.0,3.33,0.0,0.4429,7.82,64.5,4.6947,5.0,216.0,14.9,387.31,3.76,45.4 +382,9.18702,0.0,18.1,0.0,0.7,5.536,100.0,1.5804,24.0,666.0,20.2,396.9,23.6,11.3 +441,9.72418,0.0,18.1,0.0,0.74,6.406,97.2,2.0651,24.0,666.0,20.2,385.96,19.52,17.1 +352,0.07244,60.0,1.69,0.0,0.411,5.884,18.5,10.7103,4.0,411.0,18.3,392.33,7.79,18.6 +56,0.02055,85.0,0.74,0.0,0.41,6.383,35.7,9.1876,2.0,313.0,17.3,396.9,5.77,24.7 
+478,10.233,0.0,18.1,0.0,0.614,6.185,96.7,2.1705,24.0,666.0,20.2,379.7,18.03,14.6 +196,0.04011,80.0,1.52,0.0,0.404,7.287,34.1,7.309,2.0,329.0,12.6,396.9,4.08,33.3 +154,1.41385,0.0,19.58,1.0,0.871,6.129,96.0,1.7494,5.0,403.0,14.7,321.02,15.12,17.0 diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_3.csv b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_3.csv new file mode 100755 index 0000000000..72ff937fd8 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_3.csv @@ -0,0 +1,96 @@ +,0,1,2,3,4,5,6,7,8,9,10,11,12,target +379,17.8667,0.0,18.1,0.0,0.671,6.223,100.0,1.3861,24.0,666.0,20.2,393.74,21.78,10.2 +350,0.06211,40.0,1.25,0.0,0.429,6.49,44.4,8.7921,1.0,335.0,19.7,396.9,5.98,22.9 +408,7.40389,0.0,18.1,0.0,0.597,5.617,97.9,1.4547,24.0,666.0,20.2,314.64,26.4,17.2 +237,0.51183,0.0,6.2,0.0,0.507,7.358,71.6,4.148,8.0,307.0,17.4,390.07,4.73,31.5 +182,0.09103,0.0,2.46,0.0,0.488,7.155,92.2,2.7006,3.0,193.0,17.8,394.12,4.82,37.9 +66,0.04379,80.0,3.37,0.0,0.398,5.787,31.1,6.6115,4.0,337.0,16.1,396.9,10.24,19.4 +40,0.03359,75.0,2.95,0.0,0.428,7.024,15.8,5.4011,3.0,252.0,18.3,395.62,1.98,34.9 +360,4.54192,0.0,18.1,0.0,0.77,6.398,88.0,2.5182,24.0,666.0,20.2,374.56,7.79,25.0 +260,0.54011,20.0,3.97,0.0,0.647,7.203,81.8,2.1121,5.0,264.0,13.0,392.8,9.59,33.8 +443,9.96654,0.0,18.1,0.0,0.74,6.485,100.0,1.9784,24.0,666.0,20.2,386.73,18.85,15.4 +496,0.2896,0.0,9.69,0.0,0.585,5.39,72.9,2.7986,6.0,391.0,19.2,396.9,21.14,19.7 +28,0.77299,0.0,8.14,0.0,0.538,6.495,94.4,4.4547,4.0,307.0,21.0,387.94,12.8,18.4 +177,0.05425,0.0,4.05,0.0,0.51,6.315,73.4,3.3175,5.0,296.0,16.6,395.6,6.29,24.6 +420,11.0874,0.0,18.1,0.0,0.718,6.411,100.0,1.8589,24.0,666.0,20.2,318.75,15.02,16.7 +106,0.1712,0.0,8.56,0.0,0.52,5.836,91.9,2.211,5.0,384.0,20.9,395.67,18.66,19.5 
+69,0.12816,12.5,6.07,0.0,0.409,5.885,33.0,6.498,4.0,345.0,18.9,396.9,8.79,20.9 +439,9.39063,0.0,18.1,0.0,0.74,5.627,93.9,1.8172,24.0,666.0,20.2,396.9,22.88,12.8 +340,0.06151,0.0,5.19,0.0,0.515,5.968,58.5,4.8122,5.0,224.0,20.2,396.9,9.29,18.7 +54,0.0136,75.0,4.0,0.0,0.41,5.888,47.6,7.3197,3.0,469.0,21.1,396.9,14.8,18.9 +151,1.49632,0.0,19.58,0.0,0.871,5.404,100.0,1.5916,5.0,403.0,14.7,341.6,13.28,19.6 +377,9.82349,0.0,18.1,0.0,0.671,6.794,98.8,1.358,24.0,666.0,20.2,396.9,21.24,13.3 +425,15.8603,0.0,18.1,0.0,0.679,5.896,95.4,1.9096,24.0,666.0,20.2,7.68,24.39,8.3 +233,0.33147,0.0,6.2,0.0,0.507,8.247,70.4,3.6519,8.0,307.0,17.4,378.95,3.95,48.3 +320,0.1676,0.0,7.38,0.0,0.493,6.426,52.3,4.5404,5.0,287.0,19.6,396.9,7.2,23.8 +202,0.02177,82.5,2.03,0.0,0.415,7.61,15.7,6.27,2.0,348.0,14.7,395.38,3.11,42.3 +50,0.08873,21.0,5.64,0.0,0.439,5.963,45.7,6.8147,4.0,243.0,16.8,395.56,13.45,19.7 +445,10.6718,0.0,18.1,0.0,0.74,6.459,94.8,1.9879,24.0,666.0,20.2,43.06,23.98,11.8 +465,3.1636,0.0,18.1,0.0,0.655,5.759,48.2,3.0665,24.0,666.0,20.2,334.4,14.13,19.9 +255,0.03548,80.0,3.64,0.0,0.392,5.876,19.1,9.2203,1.0,315.0,16.4,395.18,9.25,20.9 +498,0.23912,0.0,9.69,0.0,0.585,6.019,65.3,2.4091,6.0,391.0,19.2,396.9,12.92,21.2 +488,0.15086,0.0,27.74,0.0,0.609,5.454,92.7,1.8209,4.0,711.0,20.1,395.09,18.06,15.2 +500,0.22438,0.0,9.69,0.0,0.585,6.027,79.7,2.4982,6.0,391.0,19.2,396.9,14.33,16.8 +47,0.22927,0.0,6.91,0.0,0.448,6.03,85.5,5.6894,3.0,233.0,17.9,392.74,18.8,16.6 +231,0.46296,0.0,6.2,0.0,0.504,7.412,76.9,3.6715,8.0,307.0,17.4,376.14,5.25,31.7 +147,2.36862,0.0,19.58,0.0,0.871,4.926,95.7,1.4608,5.0,403.0,14.7,391.71,29.53,14.6 +263,0.82526,20.0,3.97,0.0,0.647,7.327,94.5,2.0788,5.0,264.0,13.0,393.42,11.25,31.0 +0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0 +430,8.49213,0.0,18.1,0.0,0.584,6.348,86.1,2.0527,24.0,666.0,20.2,83.45,17.64,14.5 +282,0.06129,20.0,3.33,1.0,0.4429,7.645,49.7,5.2119,5.0,216.0,14.9,377.07,3.01,46.0 
+203,0.0351,95.0,2.68,0.0,0.4161,7.853,33.2,5.118,4.0,224.0,14.7,392.78,3.81,48.5 +134,0.97617,0.0,21.89,0.0,0.624,5.757,98.4,2.346,4.0,437.0,21.2,262.76,17.31,15.6 +31,1.35472,0.0,8.14,0.0,0.538,6.072,100.0,4.175,4.0,307.0,21.0,376.73,13.04,14.5 +138,0.2498,0.0,21.89,0.0,0.624,5.857,98.2,1.6686,4.0,437.0,21.2,392.04,21.32,13.3 +91,0.03932,0.0,3.41,0.0,0.489,6.405,73.9,3.0921,2.0,270.0,17.8,393.55,8.2,22.0 +21,0.85204,0.0,8.14,0.0,0.538,5.965,89.2,4.0123,4.0,307.0,21.0,392.53,13.83,19.6 +71,0.15876,0.0,10.81,0.0,0.413,5.961,17.5,5.2873,4.0,305.0,19.2,376.94,9.88,21.7 +314,0.3692,0.0,9.9,0.0,0.544,6.567,87.3,3.6023,4.0,304.0,18.4,395.69,9.28,23.8 +152,1.12658,0.0,19.58,1.0,0.871,5.012,88.0,1.6102,5.0,403.0,14.7,343.28,12.12,15.3 +29,1.00245,0.0,8.14,0.0,0.538,6.674,87.3,4.239,4.0,307.0,21.0,380.23,11.98,21.0 +483,2.81838,0.0,18.1,0.0,0.532,5.762,40.3,4.0983,24.0,666.0,20.2,392.92,10.42,21.8 +38,0.17505,0.0,5.96,0.0,0.499,5.966,30.2,3.8473,5.0,279.0,19.2,393.43,10.13,24.7 +119,0.14476,0.0,10.01,0.0,0.547,5.731,65.2,2.7592,6.0,432.0,17.8,391.5,13.61,19.3 +252,0.08221,22.0,5.86,0.0,0.431,6.957,6.8,8.9067,7.0,330.0,19.1,386.09,3.53,29.6 +415,18.0846,0.0,18.1,0.0,0.679,6.434,100.0,1.8347,24.0,666.0,20.2,27.25,29.05,7.2 +389,8.15174,0.0,18.1,0.0,0.7,5.39,98.9,1.7281,24.0,666.0,20.2,396.9,20.85,11.5 +404,41.5292,0.0,18.1,0.0,0.693,5.531,85.4,1.6074,24.0,666.0,20.2,329.46,27.38,8.5 +248,0.16439,22.0,5.86,0.0,0.431,6.433,49.1,7.8265,7.0,330.0,19.1,374.71,9.52,24.5 +190,0.09068,45.0,3.44,0.0,0.437,6.951,21.5,6.4798,5.0,398.0,15.2,377.68,5.1,37.0 +452,5.09017,0.0,18.1,0.0,0.713,6.297,91.8,2.3682,24.0,666.0,20.2,385.09,17.27,16.1 +310,2.63548,0.0,9.9,0.0,0.544,4.973,37.8,2.5194,4.0,304.0,18.4,350.45,12.64,16.1 +170,1.20742,0.0,19.58,0.0,0.605,5.875,94.6,2.4259,5.0,403.0,14.7,292.29,14.43,17.4 +437,15.1772,0.0,18.1,0.0,0.74,6.152,100.0,1.9142,24.0,666.0,20.2,9.32,26.45,8.7 +146,2.15505,0.0,19.58,0.0,0.871,5.628,100.0,1.5166,5.0,403.0,14.7,169.27,16.65,15.6 
+216,0.0456,0.0,13.89,1.0,0.55,5.888,56.0,3.1121,5.0,276.0,16.4,392.8,13.51,23.3 +333,0.05083,0.0,5.19,0.0,0.515,6.316,38.1,6.4584,5.0,224.0,20.2,389.71,5.68,22.2 +311,0.79041,0.0,9.9,0.0,0.544,6.122,52.8,2.6403,4.0,304.0,18.4,396.9,5.98,22.1 +52,0.0536,21.0,5.64,0.0,0.439,6.511,21.1,6.8147,4.0,243.0,16.8,396.9,5.28,25.0 +413,28.6558,0.0,18.1,0.0,0.597,5.155,100.0,1.5894,24.0,666.0,20.2,210.97,20.08,16.3 +35,0.06417,0.0,5.96,0.0,0.499,5.933,68.2,3.3603,5.0,279.0,19.2,396.9,9.68,18.9 +205,0.13642,0.0,10.59,0.0,0.489,5.891,22.3,3.9454,4.0,277.0,18.6,396.9,10.87,22.6 +499,0.17783,0.0,9.69,0.0,0.585,5.569,73.5,2.3999,6.0,391.0,19.2,395.77,15.1,17.5 +307,0.04932,33.0,2.18,0.0,0.472,6.849,70.3,3.1827,7.0,222.0,18.4,396.9,7.53,28.2 +86,0.05188,0.0,4.49,0.0,0.449,6.015,45.1,4.4272,3.0,247.0,18.5,395.99,12.86,22.5 +272,0.1146,20.0,6.96,0.0,0.464,6.538,58.7,3.9175,3.0,223.0,18.6,394.96,7.73,24.4 +451,5.44114,0.0,18.1,0.0,0.713,6.655,98.2,2.3552,24.0,666.0,20.2,355.29,17.73,15.2 +222,0.62356,0.0,6.2,1.0,0.507,6.879,77.7,3.2721,8.0,307.0,17.4,390.39,9.93,27.5 +112,0.12329,0.0,10.01,0.0,0.547,5.913,92.9,2.3534,6.0,432.0,17.8,394.95,16.21,18.8 +167,1.80028,0.0,19.58,0.0,0.605,5.877,79.2,2.4259,5.0,403.0,14.7,227.61,12.14,23.8 +12,0.09378,12.5,7.87,0.0,0.524,5.889,39.0,5.4509,5.0,311.0,15.2,390.5,15.71,21.7 +477,15.0234,0.0,18.1,0.0,0.614,5.304,97.3,2.1007,24.0,666.0,20.2,349.48,24.91,12.0 +267,0.57834,20.0,3.97,0.0,0.575,8.297,67.0,2.4216,5.0,264.0,13.0,384.54,7.44,50.0 +265,0.76162,20.0,3.97,0.0,0.647,5.56,62.8,1.9865,5.0,264.0,13.0,392.4,10.45,22.8 +215,0.19802,0.0,10.59,0.0,0.489,6.182,42.4,3.9454,4.0,277.0,18.6,393.63,9.47,25.0 +105,0.13262,0.0,8.56,0.0,0.52,5.851,96.7,2.1069,5.0,384.0,20.9,394.05,16.47,19.5 +374,18.4982,0.0,18.1,0.0,0.668,4.138,100.0,1.137,24.0,666.0,20.2,396.9,37.97,13.8 +384,20.0849,0.0,18.1,0.0,0.7,4.368,91.2,1.4395,24.0,666.0,20.2,285.83,30.63,8.8 +383,7.99248,0.0,18.1,0.0,0.7,5.52,100.0,1.5331,24.0,666.0,20.2,396.9,24.56,12.3 
+173,0.09178,0.0,4.05,0.0,0.51,6.416,84.1,2.6463,5.0,296.0,16.6,395.5,9.04,23.6 +330,0.04544,0.0,3.24,0.0,0.46,6.144,32.2,5.8736,4.0,430.0,16.9,368.57,9.09,19.8 +434,13.9134,0.0,18.1,0.0,0.713,6.208,95.0,2.2222,24.0,666.0,20.2,100.63,15.17,11.7 +209,0.43571,0.0,10.59,1.0,0.489,5.344,100.0,3.875,4.0,277.0,18.6,396.9,23.09,20.0 +419,11.8123,0.0,18.1,0.0,0.718,6.824,76.5,1.794,24.0,666.0,20.2,48.45,22.74,8.4 +26,0.67191,0.0,8.14,0.0,0.538,5.813,90.3,4.682,4.0,307.0,21.0,376.88,14.81,16.6 +462,6.65492,0.0,18.1,0.0,0.713,6.317,83.0,2.7344,24.0,666.0,20.2,396.9,13.99,19.5 +458,7.75223,0.0,18.1,0.0,0.713,6.301,83.7,2.7831,24.0,666.0,20.2,272.21,16.23,14.9 diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_4.csv b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_4.csv new file mode 100755 index 0000000000..ab41622cc3 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/data/distributed_data/linear_regression_train_4.csv @@ -0,0 +1,95 @@ +,0,1,2,3,4,5,6,7,8,9,10,11,12,target +476,4.87141,0.0,18.1,0.0,0.614,6.484,93.6,2.3053,24.0,666.0,20.2,396.21,18.68,16.7 +354,0.04301,80.0,1.91,0.0,0.413,5.663,21.9,10.5857,4.0,334.0,22.0,382.8,8.05,18.2 +101,0.11432,0.0,8.56,0.0,0.52,6.781,71.3,2.8561,5.0,384.0,20.9,395.58,7.67,26.5 +256,0.01538,90.0,3.75,0.0,0.394,7.454,34.2,6.3361,3.0,244.0,15.9,386.34,3.11,44.0 +339,0.05497,0.0,5.19,0.0,0.515,5.985,45.4,4.8122,5.0,224.0,20.2,396.9,9.74,19.0 +2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7 +390,6.96215,0.0,18.1,0.0,0.7,5.713,97.0,1.9265,24.0,666.0,20.2,394.43,17.11,15.1 +174,0.08447,0.0,4.05,0.0,0.51,5.859,68.7,2.7019,5.0,296.0,16.6,393.23,9.64,22.6 +337,0.03041,0.0,5.19,0.0,0.515,5.895,59.6,5.615,5.0,224.0,20.2,394.81,10.56,18.5 +9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9 
+464,7.83932,0.0,18.1,0.0,0.655,6.209,65.4,2.9634,24.0,666.0,20.2,396.9,13.22,21.4 +381,15.8744,0.0,18.1,0.0,0.671,6.545,99.1,1.5192,24.0,666.0,20.2,396.9,21.08,10.9 +200,0.01778,95.0,1.47,0.0,0.403,7.135,13.9,7.6534,3.0,402.0,17.0,384.3,4.45,32.9 +150,1.6566,0.0,19.58,0.0,0.871,6.122,97.3,1.618,5.0,403.0,14.7,372.8,14.1,21.5 +181,0.06888,0.0,2.46,0.0,0.488,6.144,62.2,2.5979,3.0,193.0,17.8,396.9,9.45,36.2 +351,0.0795,60.0,1.69,0.0,0.411,6.579,35.9,10.7103,4.0,411.0,18.3,370.78,5.49,24.1 +450,6.71772,0.0,18.1,0.0,0.713,6.749,92.6,2.3236,24.0,666.0,20.2,0.32,17.44,13.4 +423,7.05042,0.0,18.1,0.0,0.614,6.103,85.1,2.0218,24.0,666.0,20.2,2.52,23.29,13.4 +303,0.1,34.0,6.09,0.0,0.433,6.982,17.7,5.4917,7.0,329.0,16.1,390.43,4.86,33.1 +16,1.05393,0.0,8.14,0.0,0.538,5.935,29.3,4.4986,4.0,307.0,21.0,386.85,6.58,23.1 +455,4.75237,0.0,18.1,0.0,0.713,6.525,86.5,2.4358,24.0,666.0,20.2,50.92,18.13,14.1 +329,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6 +334,0.03738,0.0,5.19,0.0,0.515,6.31,38.5,6.4584,5.0,224.0,20.2,389.4,6.75,20.7 +387,22.5971,0.0,18.1,0.0,0.7,5.0,89.5,1.5184,24.0,666.0,20.2,396.9,31.99,7.4 +312,0.26169,0.0,9.9,0.0,0.544,6.023,90.4,2.834,4.0,304.0,18.4,396.3,11.72,19.4 +271,0.16211,20.0,6.96,0.0,0.464,6.24,16.3,4.429,3.0,223.0,18.6,396.9,6.59,25.2 +308,0.49298,0.0,9.9,0.0,0.544,6.635,82.5,3.3175,4.0,304.0,18.4,396.9,4.54,22.8 +396,5.87205,0.0,18.1,0.0,0.693,6.405,96.0,1.6768,24.0,666.0,20.2,396.9,19.37,12.5 +17,0.7842,0.0,8.14,0.0,0.538,5.99,81.7,4.2579,4.0,307.0,21.0,386.75,14.67,17.5 +104,0.1396,0.0,8.56,0.0,0.52,6.167,90.0,2.421,5.0,384.0,20.9,392.69,12.33,20.1 +140,0.2909,0.0,21.89,0.0,0.624,6.174,93.6,1.6119,4.0,437.0,21.2,388.08,24.16,14.0 +281,0.03705,20.0,3.33,0.0,0.4429,6.968,37.2,5.2447,5.0,216.0,14.9,392.23,4.59,35.4 +207,0.25199,0.0,10.59,0.0,0.489,5.783,72.7,4.3549,4.0,277.0,18.6,389.43,18.06,22.5 +365,4.55587,0.0,18.1,0.0,0.718,3.561,87.9,1.6132,24.0,666.0,20.2,354.7,7.12,27.5 
+503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,23.9 +361,3.83684,0.0,18.1,0.0,0.77,6.251,91.1,2.2955,24.0,666.0,20.2,350.65,14.19,19.9 +491,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07,13.6 +88,0.0566,0.0,3.41,0.0,0.489,7.007,86.3,3.4217,2.0,270.0,17.8,396.9,5.5,23.6 +332,0.03466,35.0,6.06,0.0,0.4379,6.031,23.3,6.6407,1.0,304.0,16.9,362.25,7.83,19.4 +192,0.08664,45.0,3.44,0.0,0.437,7.178,26.3,6.4798,5.0,398.0,15.2,390.49,2.87,36.4 +81,0.04462,25.0,4.86,0.0,0.426,6.619,70.4,5.4007,4.0,281.0,19.0,395.63,7.22,23.9 +197,0.04666,80.0,1.52,0.0,0.404,7.107,36.6,7.309,2.0,329.0,12.6,354.31,8.61,30.3 +4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2 +57,0.01432,100.0,1.32,0.0,0.411,6.816,40.5,8.3248,5.0,256.0,15.1,392.9,3.95,31.6 +22,1.23247,0.0,8.14,0.0,0.538,6.142,91.7,3.9769,4.0,307.0,21.0,396.9,18.72,15.2 +139,0.54452,0.0,21.89,0.0,0.624,6.151,97.9,1.6687,4.0,437.0,21.2,396.9,18.46,17.8 +275,0.09604,40.0,6.41,0.0,0.447,6.854,42.8,4.2673,4.0,254.0,17.6,396.9,2.98,32.0 +301,0.03537,34.0,6.09,0.0,0.433,6.59,40.4,5.4917,7.0,329.0,16.1,395.75,9.5,22.0 +163,1.51902,0.0,19.58,1.0,0.605,8.375,93.9,2.162,5.0,403.0,14.7,388.45,3.32,50.0 +235,0.33045,0.0,6.2,0.0,0.507,6.086,61.5,3.6519,8.0,307.0,17.4,376.75,10.88,24.0 +347,0.0187,85.0,4.15,0.0,0.429,6.516,27.7,8.5353,4.0,351.0,17.9,392.43,6.36,23.1 +411,14.0507,0.0,18.1,0.0,0.597,6.657,100.0,1.5275,24.0,666.0,20.2,35.05,21.22,17.2 +428,7.36711,0.0,18.1,0.0,0.679,6.193,78.1,1.9356,24.0,666.0,20.2,96.73,21.52,11.0 +127,0.25915,0.0,21.89,0.0,0.624,5.693,96.0,1.7883,4.0,437.0,21.2,392.11,17.19,16.2 +48,0.25387,0.0,6.91,0.0,0.448,5.399,95.3,5.87,3.0,233.0,17.9,396.9,30.81,14.4 +370,6.53876,0.0,18.1,1.0,0.631,7.016,97.5,1.2024,24.0,666.0,20.2,392.05,2.96,50.0 +417,25.9406,0.0,18.1,0.0,0.679,5.304,89.1,1.6475,24.0,666.0,20.2,127.36,26.64,10.4 +58,0.15445,25.0,5.13,0.0,0.453,6.145,29.2,7.8148,8.0,284.0,19.7,390.68,6.86,23.3 
+23,0.98843,0.0,8.14,0.0,0.538,5.813,100.0,4.0952,4.0,307.0,21.0,394.54,19.88,14.5 +232,0.57529,0.0,6.2,0.0,0.507,8.337,73.3,3.8384,8.0,307.0,17.4,385.91,2.47,41.7 +73,0.19539,0.0,10.81,0.0,0.413,6.245,6.2,5.2873,4.0,305.0,19.2,377.17,7.54,23.4 +426,12.2472,0.0,18.1,0.0,0.584,5.837,59.7,1.9976,24.0,666.0,20.2,24.65,15.69,10.2 +120,0.06899,0.0,25.65,0.0,0.581,5.87,69.7,2.2577,2.0,188.0,19.1,389.15,14.37,22.0 +407,11.9511,0.0,18.1,0.0,0.659,5.608,100.0,1.2852,24.0,666.0,20.2,332.09,12.13,27.9 +326,0.30347,0.0,7.38,0.0,0.493,6.312,28.9,5.4159,5.0,287.0,19.6,396.9,6.15,23.0 +268,0.5405,20.0,3.97,0.0,0.575,7.47,52.6,2.872,5.0,264.0,13.0,390.3,3.16,43.5 +245,0.19133,22.0,5.86,0.0,0.431,5.605,70.2,7.9549,7.0,330.0,19.1,389.13,18.46,18.5 +76,0.10153,0.0,12.83,0.0,0.437,6.279,74.5,4.0522,5.0,398.0,18.7,373.66,11.97,20.0 +110,0.10793,0.0,8.56,0.0,0.52,6.195,54.4,2.7778,5.0,384.0,20.9,393.49,13.0,21.7 +13,0.62976,0.0,8.14,0.0,0.538,5.949,61.8,4.7075,4.0,307.0,21.0,396.9,8.26,20.4 +316,0.31827,0.0,9.9,0.0,0.544,5.914,83.2,3.9986,4.0,304.0,18.4,390.7,18.33,17.8 +165,2.924,0.0,19.58,0.0,0.605,6.101,93.0,2.2834,5.0,403.0,14.7,240.16,9.81,25.0 +444,12.8023,0.0,18.1,0.0,0.74,5.854,96.6,1.8956,24.0,666.0,20.2,240.52,23.79,10.8 +324,0.34109,0.0,7.38,0.0,0.493,6.415,40.1,4.7211,5.0,287.0,19.6,396.9,6.12,25.0 +251,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59,24.8 +116,0.13158,0.0,10.01,0.0,0.547,6.176,72.5,2.7301,6.0,432.0,17.8,393.3,12.04,21.2 +342,0.02498,0.0,1.89,0.0,0.518,6.54,59.7,6.2669,1.0,422.0,15.9,389.96,8.65,16.5 +72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52,22.8 +297,0.14103,0.0,13.92,0.0,0.437,5.79,58.0,6.32,4.0,289.0,16.0,396.9,15.84,20.3 +380,88.9762,0.0,18.1,0.0,0.671,6.968,91.9,1.4165,24.0,666.0,20.2,396.9,17.21,10.4 +279,0.21038,20.0,3.33,0.0,0.4429,6.812,32.2,4.1007,5.0,216.0,14.9,396.9,4.85,35.1 +371,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53,50.0 
+368,4.89822,0.0,18.1,0.0,0.631,4.97,100.0,1.3325,24.0,666.0,20.2,375.52,3.26,50.0 +300,0.04417,70.0,2.24,0.0,0.4,6.871,47.4,7.8278,5.0,358.0,14.8,390.86,6.07,24.8 +298,0.06466,70.0,2.24,0.0,0.4,6.345,20.1,7.8278,5.0,358.0,14.8,368.24,4.97,22.5 +306,0.07503,33.0,2.18,0.0,0.472,7.42,71.9,3.0992,7.0,222.0,18.4,396.9,6.47,33.4 +227,0.41238,0.0,6.2,0.0,0.504,7.163,79.9,3.2157,8.0,307.0,17.4,372.08,6.36,31.6 +208,0.13587,0.0,10.59,1.0,0.489,6.064,59.1,4.2392,4.0,277.0,18.6,381.32,14.66,24.4 +293,0.08265,0.0,13.92,0.0,0.437,6.127,18.4,5.5027,4.0,289.0,16.0,396.9,8.58,23.9 +46,0.18836,0.0,6.91,0.0,0.448,5.786,33.3,5.1004,3.0,233.0,17.9,396.9,14.15,20.0 +412,18.811,0.0,18.1,0.0,0.597,4.628,100.0,1.5539,24.0,666.0,20.2,28.79,34.37,17.9 +5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7 +144,2.77974,0.0,19.58,0.0,0.871,4.903,97.8,1.3459,5.0,403.0,14.7,396.9,29.29,11.8 +318,0.40202,0.0,9.9,0.0,0.544,6.382,67.2,3.5325,4.0,304.0,18.4,395.21,10.36,23.1 diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/models/store_models_in_this_folder.txt b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/models/store_models_in_this_folder.txt new file mode 100755 index 0000000000..e69de29bb2 diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/results/store_results_in_this_folder.txt b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/results/store_results_in_this_folder.txt new file mode 100755 index 0000000000..e69de29bb2 diff --git a/Libraries/oneDAL/daal4py_Distributed_LinearRegression/sample.json b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/sample.json new file mode 100755 index 0000000000..999b8a7180 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Distributed_LinearRegression/sample.json @@ -0,0 +1,22 @@ +{ + "guid": "ED2952EA-04CB-4353-9FE6-80E0F7DCA098", + "name": "daal4py Distributed Linear Regression", + "categories": ["Toolkit/Intel® AI Analytics Toolkit/oneDAL"], + "description": "This sample code shows how 
to train and predict with a distributed linear regression model with the Intel Distribution of Python using the python API package daal4py for oneDAL", + "builder": ["cli"], + "languages": [{"python":{}}], + "dependencies": ["oneDAL"], + "os":["linux"], + "targetDevice": ["CPU"], + "ciTests": { + "linux": [ + { + "env": ["source /opt/intel/oneapi/setvars.sh --force", "source activate base"], + "id": "d4p_Linear_Regression_Dist", + "steps": [ + "mpirun -n 4 python ./daal4py_Distributed_LinearRegression.py" + ] + } + ] +} +} diff --git a/Libraries/oneDAL/daal4py_Getting_Started/Jupyter_Run.jpg b/Libraries/oneDAL/daal4py_Getting_Started/Jupyter_Run.jpg new file mode 100755 index 0000000000..4f54045950 Binary files /dev/null and b/Libraries/oneDAL/daal4py_Getting_Started/Jupyter_Run.jpg differ diff --git a/Libraries/oneDAL/daal4py_Getting_Started/Jupyter_Save_Py.jpg b/Libraries/oneDAL/daal4py_Getting_Started/Jupyter_Save_Py.jpg new file mode 100755 index 0000000000..f4248cddbb Binary files /dev/null and b/Libraries/oneDAL/daal4py_Getting_Started/Jupyter_Save_Py.jpg differ diff --git a/Libraries/oneDAL/daal4py_Getting_Started/License.txt b/Libraries/oneDAL/daal4py_Getting_Started/License.txt new file mode 100644 index 0000000000..a3ab05efce --- /dev/null +++ b/Libraries/oneDAL/daal4py_Getting_Started/License.txt @@ -0,0 +1,8 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +© 2020 GitHub, Inc. \ No newline at end of file diff --git a/Libraries/oneDAL/daal4py_Getting_Started/README.md b/Libraries/oneDAL/daal4py_Getting_Started/README.md new file mode 100755 index 0000000000..8267be8bfb --- /dev/null +++ b/Libraries/oneDAL/daal4py_Getting_Started/README.md @@ -0,0 +1,149 @@ +# daal4py Getting Started +This Getting Started sample code show how to do batch linear regression using the python API package daal4py from oneDAL. It demonstrates how to use software products that can be found in the [Intel oneAPI Data Analytics Library](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onedal.html) or the [Intel AI Analytics Toolkit powered by oneAPI](https://software.intel.com/content/www/us/en/develop/tools/oneapi/ai-analytics-toolkit.html). + +| Optimized for | Description +| :--- | :--- +| OS | 64-bit Linux: Ubuntu 18.04 or higher, 64-bit Windows 10, macOS 10.14 or higher +| Hardware | Intel Atom® Processors; Intel® Core™ Processor Family; Intel® Xeon® Processor Family; Intel® Xeon® Scalable Performance Processor Family +| Software | oneDAL Software Library, Python version 2.7 or >= 3.6, conda-build version >= 3, C++ compiler with C++11 support, Pickle, Pandas, NumPy +| What you will learn | basic oneDAL programming model for Intel CPU +| Time to complete | 5 minutes + +## Purpose + +daal4py is a simplified API to Intel® DAAL that allows for fast usage of the framework suited for Data Scientists or Machine Learning users. 
Built to help provide an abstraction to Intel® DAAL for either direct usage or integration into one's own framework. + +In this sample you will run a batch Linear Regression model with oneDAL daal4py library memory objects. You will also learn how to train a model and save the information to a file. + +## Key Implementation Details +This Getting Started sample code is implemented for CPU using the Python language. The example assumes you have daal4py and scikit-learn installed inside a conda environment, similar to what is delivered with the installation of the Intel(R) Distribution for Python as part of the [oneAPI AI Analytics Toolkit powered by oneAPI](https://software.intel.com/en-us/oneapi/ai-kit). + +## License +This code sample is licensed under MIT license + +## Building daal4py for CPU + +oneAPI Data Analytics Library is ready for use once you finish the Intel AI Analytics Toolkit installation, and have run the post installation script. + +You can refer to the oneAPI [main page](https://software.intel.com/en-us/oneapi) for toolkit installation, and the Toolkit [Getting Started Guide for Linux](https://software.intel.com/en-us/get-started-with-intel-oneapi-linux-get-started-with-the-intel-ai-analytics-toolkit) for post-installation steps and scripts. + +### Activate conda environment With Root Access + +Please follow the Getting Started Guide steps (above) to set up your oneAPI environment with the setvars.sh script. Then navigate in linux shell to your oneapi installation path, typically `~/intel/inteloneapi`. Intel Python environment will be activte by default. However, if you activated another environment, you can return with the following command: + +#### On a Linux* System +``` +source activate base +``` + +### Activate conda environment Without Root Access (Optional) + +By default, the Intel AI Analytics Toolkit is installed in the inteloneapi folder, which requires root privileges to manage it. 
If you would like to bypass using root access to manage your conda environment, then you can clone your desired conda environment using the following command: + +#### On a Linux* System +``` +conda create --name user_base --clone base +``` + +Then activate your conda environment with the following command: + +``` +source activate user_base +``` + +### Install Jupyter Notebook + +Launch Jupyter Notebook in the directory housing the code example + +``` +conda install jupyter nb_conda_kernels +``` + +#### View in Jupyter Notebook + +_Note: This distributed execution cannot be launched from the jupyter notebook version, but you can still view inside the notebook to follow the included write-up and description._ + +Launch Jupyter Notebook in the directory housing the code example + +``` +jupyter notebook +``` +## Running the Sample + +### Running the Sample as a Jupyter Notebook + +Open .pynb file and run cells in Jupyter Notebook using the "Run" button (see image) + +![Click the Run Button in the Jupyter Notebook](Jupyter_Run.jpg "Run Button on Jupyter Notebook") + +##### Expected Printed Output for Cells (with similar numbers): +``` +Here's our model: + + + NumberOfBetas: 14 + +NumberOfResponses: 1 + +InterceptFlag: False + +Beta: array( + [[ 0.00000000e+00 -1.05416344e-01 5.25259886e-02 4.26844883e-03 + 2.76607367e+00 -2.82517989e+00 5.49968304e+00 3.48833264e-03 + -8.73247684e-01 1.74005447e-01 -8.38917510e-03 -3.28044397e-01 + 1.58423529e-02 -4.57542900e-01]], + dtype=float64, shape=(1, 14)) + +NumberOfFeatures: 13 + +Here is one of our loaded model's features: + + [[ 0.00000000e+00 -1.05416344e-01 5.25259886e-02 4.26844883e-03 + 2.76607367e+00 -2.82517989e+00 5.49968304e+00 3.48833264e-03 + -8.73247684e-01 1.74005447e-01 -8.38917510e-03 -3.28044397e-01 + 1.58423529e-02 -4.57542900e-01]] +[CODE_SAMPLE_COMPLETED_SUCCESFULLY] +``` + + +### Running the Sample as a Python File + +Open notebook in Jupyter and download as python file + +![Download as python file in the 
Jupyter Notebook](Jupyter_Save_Py.jpg "Download as python file in the Jupyter Notebook") + +Run the Program + +`python IntelPython_GettingStarted.py` + +The output files of the script will be saved in the included models and results directories. + +##### Expected Printed Output (with similar numbers): +``` +Here's our model: + + + NumberOfBetas: 14 + +NumberOfResponses: 1 + +InterceptFlag: False + +Beta: array( + [[ 0.00000000e+00 -1.05416344e-01 5.25259886e-02 4.26844883e-03 + 2.76607367e+00 -2.82517989e+00 5.49968304e+00 3.48833264e-03 + -8.73247684e-01 1.74005447e-01 -8.38917510e-03 -3.28044397e-01 + 1.58423529e-02 -4.57542900e-01]], + dtype=float64, shape=(1, 14)) + +NumberOfFeatures: 13 + +Here is one of our loaded model's features: + + [[ 0.00000000e+00 -1.05416344e-01 5.25259886e-02 4.26844883e-03 + 2.76607367e+00 -2.82517989e+00 5.49968304e+00 3.48833264e-03 + -8.73247684e-01 1.74005447e-01 -8.38917510e-03 -3.28044397e-01 + 1.58423529e-02 -4.57542900e-01]] +[CODE_SAMPLE_COMPLETED_SUCCESFULLY] +``` + diff --git a/Libraries/oneDAL/daal4py_Getting_Started/daal4py_GettingStarted.ipynb b/Libraries/oneDAL/daal4py_Getting_Started/daal4py_GettingStarted.ipynb new file mode 100755 index 0000000000..521b43af6b --- /dev/null +++ b/Libraries/oneDAL/daal4py_Getting_Started/daal4py_GettingStarted.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================\n", + "# Copyright © 2020 Intel Corporation\n", + "# \n", + "# SPDX-License-Identifier: MIT\n", + "# =============================================================" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# IntelPython Getting Started Example for Shared Memory Systems" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing and Organizing Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "In this example we will be predicting **prices of houses in Boston** based on the features of each house.\n", + "\n", + "Let's start by **importing** all necessary data and packages." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "##### Linear regression example for shared memory systems #####\n", + "import daal4py as d4p\n", + "from sklearn.datasets import load_boston\n", + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "import numpy as np\n", + "import pickle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **load** in the dataset and **organize** it as necessary to work with our model." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# loading in the data\n", + "data = load_boston()\n", + "\n", + "# organizing variables used in the model for prediction\n", + "X = data.data # house characteristics\n", + "y = data.target[np.newaxis].T # house price\n", + "\n", + "# splitting the data for training and testing, with a 25% test dataset size\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state =1693)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training and Saving the Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's **train our model** and look at the model's features!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# training the model for prediction\n", + "train_result = d4p.linear_regression_training().compute(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To **get training model information** and **save it to a file**:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here's our model:\n", + "\n", + "\n", + " NumberOfBetas: 14\n", + "\n", + "NumberOfResponses: 1\n", + "\n", + "InterceptFlag: False\n", + "\n", + "Beta: array(\n", + " [[ 0.00000000e+00 -1.05416344e-01 5.25259886e-02 4.26844883e-03\n", + " 2.76607367e+00 -2.82517989e+00 5.49968304e+00 3.48833264e-03\n", + " -8.73247684e-01 1.74005447e-01 -8.38917510e-03 -3.28044397e-01\n", + " 1.58423529e-02 -4.57542900e-01]],\n", + " dtype=float64, shape=(1, 14))\n", + "\n", + "NumberOfFeatures: 13 \n", + "\n" + ] + } + ], + "source": [ + "# retrieving and printing training model\n", + "model = train_result.model\n", + "print(\"Here's our model:\\n\\n\\n\", model , \"\\n\")\n", + "\n", + "model_filename = './models/linear_regression_batch.sav'\n", + "\n", + "# saving model to a file\n", + "pickle.dump(model, open(model_filename, \"wb\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **load up the model** and look at one of the model's features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here is one of our loaded model's features: \n", + "\n", + " [[ 0.00000000e+00 -1.05416344e-01 5.25259886e-02 4.26844883e-03\n", + " 2.76607367e+00 -2.82517989e+00 5.49968304e+00 3.48833264e-03\n", + " -8.73247684e-01 1.74005447e-01 -8.38917510e-03 -3.28044397e-01\n", + " 1.58423529e-02 -4.57542900e-01]]\n" + ] + } + ], + "source": [ + "# loading the training model from a file\n", + "loaded_model = pickle.load(open(model_filename, \"rb\"))\n", + "print(\"Here is one of our loaded model's features: \\n\\n\", loaded_model.Beta)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making a Prediction and Saving the Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time to **make a prediction!**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# now predicting the target feature(s) using the trained model\n", + "y_pred = d4p.linear_regression_prediction().compute(X_test, loaded_model).prediction " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's **export the results to a CSV file**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\n" + ] + } + ], + "source": [ + "np.savetxt(\"./results/linear_regression_batch_results.csv\", y_pred, delimiter = \",\")\n", + "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Libraries/oneDAL/daal4py_Getting_Started/daal4py_GettingStarted.py b/Libraries/oneDAL/daal4py_Getting_Started/daal4py_GettingStarted.py new file mode 100755 index 0000000000..1719881fe8 --- /dev/null +++ b/Libraries/oneDAL/daal4py_Getting_Started/daal4py_GettingStarted.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[1]: + + +''' +============================================================= +Copyright © 2020 Intel Corporation + +SPDX-License-Identifier: MIT +============================================================= +''' + +# # IntelPython Getting Started Example for Shared Memory Systems + +# ## Importing and Organizing Data + +# In this example we will be predicting **prices of houses in Boston** based on the features of each house. +# +# Let's start by **importing** all necessary data and packages. 
+ +# In[2]: + + +##### Linear regression example for shared memory systems ##### +import daal4py as d4p +from sklearn.datasets import load_boston +from sklearn.model_selection import train_test_split +import pandas as pd +import numpy as np +import pickle + + +# Now let's **load** in the dataset and **organize** it as necessary to work with our model. + +# In[3]: + + +# loading in the data +data = load_boston() + +# organizing variables used in the model for prediction +X = data.data # house characteristics +y = data.target[np.newaxis].T # house price + +# splitting the data for training and testing, with a 25% test dataset size +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state =1693) + + +# ## Training and Saving the Model + +# Let's **train our model** and look at the model's features! + +# In[4]: + + +# training the model for prediction +train_result = d4p.linear_regression_training().compute(X_train, y_train) + + +# To **get training model information** and **save it to a file**: + +# In[5]: + + +# retrieving and printing training model +model = train_result.model +print("Here's our model:\n\n\n", model , "\n") + +model_filename = './models/linear_regression_batch.sav' + +# saving model to a file +pickle.dump(model, open(model_filename, "wb")) + + +# Now let's **load up the model** and look at one of the model's features. + +# In[6]: + + +# loading the training model from a file +loaded_model = pickle.load(open(model_filename, "rb")) +print("Here is one of our loaded model's features: \n\n", loaded_model.Beta) + + +# ## Making a Prediction and Saving the Results + +# Time to **make a prediction!** + +# In[7]: + + +# now predicting the target feature(s) using the trained model +y_pred = d4p.linear_regression_prediction().compute(X_test, loaded_model).prediction + + +# Now let's **export the results to a CSV file**. 
+ +# In[8]: + + +np.savetxt("./results/linear_regression_batch_results.csv", y_pred, delimiter = ",") +print("[CODE_SAMPLE_COMPLETED_SUCCESFULLY]") + diff --git a/Libraries/oneDAL/daal4py_Getting_Started/models/store_models_in_this_folder.txt b/Libraries/oneDAL/daal4py_Getting_Started/models/store_models_in_this_folder.txt new file mode 100755 index 0000000000..e69de29bb2 diff --git a/Libraries/oneDAL/daal4py_Getting_Started/results/store_results_in_this_folder.txt b/Libraries/oneDAL/daal4py_Getting_Started/results/store_results_in_this_folder.txt new file mode 100755 index 0000000000..e69de29bb2 diff --git a/Libraries/oneDAL/daal4py_Getting_Started/sample.json b/Libraries/oneDAL/daal4py_Getting_Started/sample.json new file mode 100755 index 0000000000..2a255231ce --- /dev/null +++ b/Libraries/oneDAL/daal4py_Getting_Started/sample.json @@ -0,0 +1,22 @@ +{ + "guid": "2E6A2E22-035F-493B-B471-DFD8CF8F8256", + "name": "daal4py Getting Started", + "categories": ["Toolkit/Intel® AI Analytics Toolkit/oneDAL"], + "description": "This Getting Started sample code shows how to do batch linear regression using the python API package daal4py for oneDAL", + "builder": ["cli"], + "languages": [{"python":{}}], + "dependencies": ["oneDAL"], + "os":["linux"], + "targetDevice": ["CPU"], + "ciTests": { + "linux": [ + { + "env": ["source /opt/intel/oneapi/setvars.sh --force", "source activate base"], + "id": "d4p_GS_py", + "steps": [ + "python daal4py_GettingStarted.py" + ] + } + ] +} +} diff --git a/Libraries/oneDPL/gamma-correction/README.md b/Libraries/oneDPL/gamma-correction/README.md index 718412cc40..143148d47b 100644 --- a/Libraries/oneDPL/gamma-correction/README.md +++ b/Libraries/oneDPL/gamma-correction/README.md @@ -5,7 +5,7 @@ Gamma correction is a nonlinear operation used to encode and decode the luminanc |---------------------------------|----------------------------------------------------------------------------------| | OS | Linux* Ubuntu* 18.04, Windows 10 | | Hardware 
| Skylake with GEN9 or newer | -| Software | Intel® oneAPI DPC++ Compiler beta; Intel® oneAPI DPC++ Library (oneDPL) | +| Software | Intel® oneAPI DPC++/C++ Compiler; Intel® oneAPI DPC++ Library (oneDPL) | | What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++ Library | | Time to complete | At most 5 minutes | diff --git a/Libraries/oneDPL/gamma-correction/sample.json b/Libraries/oneDPL/gamma-correction/sample.json index e2d46465e3..d99ff901c7 100644 --- a/Libraries/oneDPL/gamma-correction/sample.json +++ b/Libraries/oneDPL/gamma-correction/sample.json @@ -1,6 +1,6 @@ { "name": "Gamma Correction", - "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/oneAPI DPC++ Library/CPU and GPU"], + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/oneAPI DPC++ Library/CPU and GPU"], "description": "gamma correction - a nonlinear operation used to encode and decode the luminance of each image pixel.", "toolchain": ["dpcpp"], "languages": [{"cpp":{}}], diff --git a/Libraries/oneDPL/stable_sort_by_key/README.md b/Libraries/oneDPL/stable_sort_by_key/README.md index d6c12c5c84..4ed1222c3b 100644 --- a/Libraries/oneDPL/stable_sort_by_key/README.md +++ b/Libraries/oneDPL/stable_sort_by_key/README.md @@ -7,7 +7,7 @@ Stable sort by key is a sorting operation when sorting of 2 sequences (keys and |---------------------------------|----------------------------------------------------------------------------------| | OS | Linux* Ubuntu* 18.04 | | Hardware | Skylake with GEN9 or newer | -| Software | Intel® oneAPI DPC++ Compiler beta; Intel® oneAPI DPC++ Library (oneDPL) | +| Software | Intel® oneAPI DPC++/C++ Compiler; Intel® oneAPI DPC++ Library (oneDPL) | | What you will learn | How to use `counting_iterator` and `zip_iterator` | | Time to complete | At most 5 minutes | diff --git a/Libraries/oneDPL/stable_sort_by_key/sample.json b/Libraries/oneDPL/stable_sort_by_key/sample.json index 
66e315d599..a4c803bb7d 100644 --- a/Libraries/oneDPL/stable_sort_by_key/sample.json +++ b/Libraries/oneDPL/stable_sort_by_key/sample.json @@ -1,6 +1,6 @@ { "name": "Stable sort by key", - "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/oneAPI DPC++ Library/CPU and GPU"], + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/oneAPI DPC++ Library/CPU and GPU"], "description": "It models stable sort by key: during the sorting of 2 sequences (keys and values) only keys are compared but both keys and values are swapped", "toolchain": ["dpcpp"], "languages": [{"cpp":{}}], diff --git a/README.md b/README.md index 1fe987a98d..ee7e072fe3 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,72 @@ -oneAPI-samples -This is the readme. +|Code Sample |Supported Intel(r) Architecture(s) |Description | +|-----------------------|-------------------------------------------|---------------| +|DirectPrograming/ | +|../DPC++/CombinationalLogic/Mandelbrot |GPU, CPU |Example of a fractal in mathematics | +|../DPC++/CombinationalLogic/Sepia-filter |GPU, CPU |Color image conversion using 1D range | +|../DPC++/DenseLinearAlgebra/Complex_mult |GPU, CPU |Complex number Multiplication | +|../DPC++/DenseLinearAlgebra/Matrix_mul |GPU, CPU |Simple program that multiplies two large matrices in parallel using DPC++, OpenMP and MKL | +|../DPC++/DenseLinearAlgebra/Simple-add |FPGA, GPU, CPU |Simple Add program | +|../DPC++/DenseLinearAlgebra/Vector-add |FPGA, GPU, CPU |Simple Vector add program | +|../DPC++/GraphTraversal/Bitonic-sort |GPU, CPU |Implementation of bitonic sort using DPC++. | +|../DPC++/ParallelPatterns/Dpc_reduce |GPU, CPU |A simple program that calculates pi, implemented using C++ and DPC++. | +|../DPC++/SpectralMethods/Discrete-cosine-transform |GPU, CPU |Image processing algorithm used in JPEG compression | +|../DPC++/StructuredGrids/1d_HeatTransfer |GPU, CPU |A simulation of one dimensional heat transfer process using DPC++. 
| +|../DPC++/StructuredGrids/ISO2DFD_DPCPP |GPU, CPU |A simple finite difference stencil kernel for solving 2D acoustic isotropic wave equation using DPC++ | +|../DPC++/StructuredGrids/ISO3DFD_DPCPP |GPU, CPU |A finite difference stencil kernel for solving 3D acoustic isotropic wave equation using DPC++ | +|../DPC++/StructuredGrids/Particle-diffusion |GPU, CPU |A simple implementation of a Monte Carlo simulation of the diffusion of water molecules in tissue | +|../DPC++FPGA/ReferenceDesigns/crr |FPGA |High-performance CRR binomial tree option pricing model using DPC++ on FPGA| +|../DPC++FPGA/ReferenceDesigns/gzip |FPGA |High-performance GZIP compression using DPC++ on FPGA| +|../DPC++FPGA/ReferenceDesigns/qrd |FPGA |High-performance QR decomposition of matrices using DPC++ on FPGA| +|../DPC++FPGA/Tutorials/GettingStarted/fpga_compile |FPGA |Tutorial introducing how to compile DPC++ for FPGA | +|../DPC++FPGA/Tutorials/GettingStarted/fast_recompile |FPGA |Tutorial introducing host-only recompile to save DPC++ development time on FPGA | +|../DPC++FPGA/Tutorials/Tools/use_library |FPGA |Tutorial showing how to use cross-language libraries in DPC++ on FPGA | +|../DPC++FPGA/Tutorials/Tools/system_profiling |FPGA |Tutorial showing how to use the OpenCL Intercept Layer to profile DPC++ designs running on FPGA | +|../DPC++FPGA/Tutorials/DesignPatterns/double_buffering |FPGA |Tutorial demonstrating how to overlap kernel execution with buffer transfers and host processing | +|../DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering |FPGA |Tutorial demonstrating an extension of double buffering to n-way buffering | +|../DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache |FPGA |Tutorial demonstrating the caching of on-chip memory to reduce loop initiation interval on FPGA | +|../DPC++FPGA/Tutorials/DesignPatterns/pipe_array |FPGA |Tutorial demonstrating how to create an array of pipes | +|../DPC++FPGA/Tutorials/DesignPatterns/remove_loop_carried_dependency |FPGA |Tutorial 
demonstrating a technique to optimize performance by removing loop carried dependencies | +|../DPC++FPGA/Tutorials/DesignPatterns/triangular_loop |FPGA |Tutorial demonstrating an advanced FPGA optimization technique for triangular loops | +|../DPC++FPGA/Tutorials/Features/fpga_reg |FPGA |Tutorial demonstrating the use of the DPC++ FPGA power user extension intel::fpga_reg | +|../DPC++FPGA/Tutorials/Features/kernel_args_restrict |FPGA |Tutorial demonstrating how to avoid performance penalties due to kernel argument aliasing | +|../DPC++FPGA/Tutorials/Features/loop_coalesce |FPGA |Tutorial demonstrating the DPC++ FPGA loop_coalesce attribute | +|../DPC++FPGA/Tutorials/Features/loop_ivdep |FPGA |Tutorial demonstrating the use of the loop ivdep attribute | +|../DPC++FPGA/Tutorials/Features/loop_unroll |FPGA |Tutorial demonstrating the DPC++ unroll pragma and its performance trade-offs on FPGA | +|../DPC++FPGA/Tutorials/Features/max_concurrency |FPGA |Tutorial demonstrating the DPC++ FPGA max_concurrency attribute | +|../DPC++FPGA/Tutorials/Features/memory_attributes |FPGA |Tutorial demonstrating how to use DPC++ FPGA memory attributes | +|../DPC++FPGA/Tutorials/Features/pipes |FPGA |Tutorial demonstrating the DPC++ FPGA pipes extension to transfer data between kernels | +|../DPC++FPGA/Tutorials/Features/speculated_iterations |FPGA |Tutorial demonstrating the DPC++ FPGA speculated_iterations attribute | +|../C++/CombinationalLogic/Mandelbrot |CPU |Demonstrates how to accelerate Mandelbrot performance with SIMD and parallelization using OpenMP*. | +|../C++/CompilerInfrastructure/Intrinsics |CPU |Shows how to utilize the intrinsics supported by C++ compiler in a variety of applications. 
| +|../C++/GraphTraversal/Mergesort |CPU |Shows how to accelerate scalar merge sort program using OpenMP tasks | +|Libraries | +|../oneDPL/Gamma-correction |GPU, CPU |gamma correction using Parallel STL | +|../oneDPL/Stable_sort_by_key |GPU, CPU |stable sort by key using counting_iterator and zip_iterator | +|../oneVPL/hello-decode |CPU |shows how to use oneVPL to perform a simple video decode | +|../oneVPL/hello-encode |CPU |shows how to use oneVPL to perform a simple video encode | +|Tools | +|../ApplicationDebugger/Debugger/array-transform |GPU, CPU |Array transform | +|../IoTConnectionTools/Analog-in |CPU |Analog pin input example using Eclipse* MRAA | +|../IoTConnectionTools/Digital In |CPU |GPIO pin input example using Eclipse* MRAA | +|../IoTConnectionTools/Digital Out |CPU |GPIO pin output example using Eclipse* MRAA | +|../IoTConnectionTools/Hello IoT World |CPU |Basic example that prints the compiler used during build | +|../IoTConnectionTools/Interrupt |CPU |Interrupt Service Routine example using Eclipse* MRAA | +|../IoTConnectionTools/Onboard Blink |CPU |Built-in LED blink for common IoT boards using Eclipse* MRAA | +|../IoTConnectionTools/PWM |CPU |Pulse Width Modulation pin output using Eclipse* MRAA | +|../IoTConnectionTools/Up2 LEDs |CPU |Built-in LED example for UP* Squared using Eclipse* MRAA | +|../SystemDebug/System Debug Sample Build |UEFI |Basic example that showcases the features of the Intel® System Debugger | + +# License + +The code samples are licensed under MIT license + +# Known issues or limitations + +## On Windows Platform +1. If you are using Visual Studio 2019, Visual Studio 2019 version 16.4.0 or newer is required. +2. To build samples on Windows, the required Windows SDK is ver. 10.0.17763.0. +3. Now you should be able to build the code sample. +4. For beta, FPGA samples support Windows through FPGA-emulator. +5. 
If you encounter a compilation error like below when building a sample program, one reason is that the directory path of the sample is too long; the work around is to move the sample to a directory like "c:\temp\sample_name". + * Error MSB6003 The specified task executable "dpcpp.exe" could not be run ...... + diff --git a/Tools/ApplicationDebugger/array-transform/README.md b/Tools/ApplicationDebugger/array-transform/README.md index 255cfe8957..89023a1429 100644 --- a/Tools/ApplicationDebugger/array-transform/README.md +++ b/Tools/ApplicationDebugger/array-transform/README.md @@ -93,7 +93,14 @@ system. If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in -batch or interactive mode. For more information see the Intel® oneAPI +batch or interactive mode. For the array transform sample, a node +with GPU and an interactive shell is recommended. + +``` +$ qsub -I -l nodes=1:gpu:ppn=2 +``` + +For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/). diff --git a/Tools/IoTConnectionTools/aws-pub-sub/CMakeLists.txt b/Tools/IoTConnectionTools/aws-pub-sub/CMakeLists.txt new file mode 100644 index 0000000000..89690f2a2a --- /dev/null +++ b/Tools/IoTConnectionTools/aws-pub-sub/CMakeLists.txt @@ -0,0 +1,82 @@ +cmake_minimum_required(VERSION 3.1) +project(basic-pub-sub CXX) + +option(BUILD_DEPS "Builds aws common runtime dependencies as part of build to control your dependency chain." 
ON) + +if (DEFINED CMAKE_PREFIX_PATH) + file(TO_CMAKE_PATH "${CMAKE_PREFIX_PATH}" CMAKE_PREFIX_PATH) +endif() + +if (DEFINED CMAKE_INSTALL_PREFIX) + file(TO_CMAKE_PATH "${CMAKE_INSTALL_PREFIX}" CMAKE_INSTALL_PREFIX) +endif() + +if (NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 11) +endif() + +if (UNIX AND NOT APPLE) + include(GNUInstallDirs) +elseif(NOT DEFINED CMAKE_INSTALL_LIBDIR) + set(CMAKE_INSTALL_LIBDIR "lib") +endif() + +if (${CMAKE_INSTALL_LIBDIR} STREQUAL "lib64") + set(FIND_LIBRARY_USE_LIB64_PATHS true) +endif() + +# This is required in order to append /lib/cmake to each element in CMAKE_PREFIX_PATH +set(AWS_MODULE_DIR "/${CMAKE_INSTALL_LIBDIR}/cmake") +string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${AWS_MODULE_DIR}") +# Append that generated list to the module search path +list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH}) + +if (NOT DEFINED CMAKE_BUILD_TYPE) + if (NOT WIN32) + set(CMAKE_BUILD_TYPE "RelWithDebInfo") + endif() +endif() + +list(APPEND CMAKE_MODULE_PATH "$ENV{HOME}/sdk-cpp-workspace/aws-iot-device-sdk-cpp-v2/aws-common-runtime/aws-crt-cpp/aws-common-runtime/aws-c-common/cmake") + +include(AwsFindPackage) +set(IN_SOURCE_BUILD ON) +set(BUILD_TESTING_PREV ${BUILD_TESTING}) +set(BUILD_TESTING OFF) + +find_path(CRT_CPP_LIB aws-crt-cpp PATHS "$ENV{HOME}/sdk-cpp-workspace/aws-iot-device-sdk-cpp-v2/aws-common-runtime") +set(CRT_PATH ${CRT_CPP_LIB}/aws-crt-cpp) +add_subdirectory(${CRT_PATH} build EXCLUDE_FROM_ALL) +set(BUILD_TESTING ${BUILD_TESTING_PREV}) + + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_INSTALL_PREFIX}/lib/cmake") + +file(GLOB PUB_SUB_SRC + "cpp/*.cpp" +) + +set(PUB_SUB_PROJECT_NAME basic-pub-sub) +add_executable(${PUB_SUB_PROJECT_NAME} ${PUB_SUB_SRC}) +set_target_properties(${PUB_SUB_PROJECT_NAME} PROPERTIES LINKER_LANGUAGE CXX) + +set(CMAKE_C_FLAGS_DEBUGOPT "") + +#set warnings +if (MSVC) + target_compile_options(${PUB_SUB_PROJECT_NAME} PRIVATE /W4 /WX /wd4068) +else () + 
target_compile_options(${PUB_SUB_PROJECT_NAME} PRIVATE -Wall -Wno-long-long -pedantic -Werror) +endif () + +if (CMAKE_BUILD_TYPE STREQUAL "" OR CMAKE_BUILD_TYPE MATCHES Debug) + target_compile_definitions(${PUB_SUB_PROJECT_NAME} PRIVATE "-DDEBUG_BUILD") +endif () + +target_include_directories(${PUB_SUB_PROJECT_NAME} PUBLIC + $ + $) + +aws_use_package(aws-crt-cpp) +target_link_libraries(${PUB_SUB_PROJECT_NAME} ${DEP_AWS_LIBS}) + diff --git a/Tools/IoTConnectionTools/aws-pub-sub/License.txt b/Tools/IoTConnectionTools/aws-pub-sub/License.txt new file mode 100644 index 0000000000..e63c6e13dc --- /dev/null +++ b/Tools/IoTConnectionTools/aws-pub-sub/License.txt @@ -0,0 +1,7 @@ +Copyright Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/Tools/IoTConnectionTools/aws-pub-sub/README.md b/Tools/IoTConnectionTools/aws-pub-sub/README.md new file mode 100644 index 0000000000..1db51d3803 --- /dev/null +++ b/Tools/IoTConnectionTools/aws-pub-sub/README.md @@ -0,0 +1,69 @@ +# `AWS Pub Sub` Sample + +`AWS Pub Sub` is a sample that could be used for a quick test of Amazon cloud libraries. + + +| Optimized for | Description +|:--- |:--- +| OS | Linux* Ubuntu* 16.04, Linux* Ubuntu* 18.04 +| Software | C++ 11 or higher, CMake 3.1+, Clang 3.9+ or GCC 4.4+, AWS IoT Device SDK C++ v2 +| What you will learn | Use the Message Broker for AWS IoT to send and receive messages through an MQTT connection + + +This version of the sample has been tested on Ubuntu Linux. This sample requires additional system configuration when using Ubuntu OS. Instructions on how to install the custom provided all dependency libraries for Linux can be [found here](). + +## Purpose +`AWS Pub Sub` is a simple program that helps user to execute the example of the AWS code and toconfigure and run Amazon Cloud services. + +## Key Implementation Details +This sample uses the Message Broker for AWS IoT to send and receive messages through an MQTT connection. + +##License +This sample is licensed under Apache License v2.0 + +## Building the `AWS Pub Sub` + +### On a Linux System + +Perform the following steps: +1. Run in the terminal: +``` +cd $HOME +mkdir sdk-cpp-workspace +cd sdk-cpp-workspace +git clone --recursive https://github.com/aws/aws-iot-device-sdk-cpp-v2.git +mkdir aws-iot-device-sdk-cpp-v2-build +cd aws-iot-device-sdk-cpp-v2-build +cmake -DCMAKE_INSTALL_PREFIX="" -DCMAKE_PREFIX_PATH="" -DBUILD_DEPS=ON ../aws-iot-device-sdk-cpp-v2 +cmake --build . --target install +``` + +2. To execute the sample that had been built run in the terminal: +``` +basic-pub-sub --endpoint --cert --key --topic --ca_file --use_websocket --signing_region --proxy_host --proxy_port +``` + +3. 
Clean the program using: + +``` +make clean +``` + + +## Running the Sample +### Application Parameters + +endpoint: the endpoint of the mqtt server not including a port +cert: path to your client certificate in PEM format. If this is not set you must specify use_websocket +key: path to your key in PEM format. If this is not set you must specify use_websocket +topic: topic to publish, subscribe to. +client_id: client id to use (optional) +ca_file: Optional, if the mqtt server uses a certificate that's not already in your trust store, set this. + It's the path to a CA file in PEM format +use_websocket: if specified, uses a websocket over https (optional) +signing_region: used for websocket signer it should only be specific if websockets are used. (required for websockets) +proxy_host: if you want to use a proxy with websockets, specify the host here (optional). +proxy_port: defaults to 8080 is proxy_host is set. Set this to any value you'd like (optional). + +### Example of Output +TBD diff --git a/Tools/IoTConnectionTools/aws-pub-sub/cpp/main.cpp b/Tools/IoTConnectionTools/aws-pub-sub/cpp/main.cpp new file mode 100644 index 0000000000..72d262bcc5 --- /dev/null +++ b/Tools/IoTConnectionTools/aws-pub-sub/cpp/main.cpp @@ -0,0 +1,374 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ +#include +#include + +#include + +#include +#include +#include +#include +#include + +using namespace Aws::Crt; + +static void s_printHelp() +{ + fprintf(stdout, "Usage:\n"); + fprintf( + stdout, + "basic-pub-sub --endpoint --cert " + " --key --topic --ca_file " + " --use_websocket --signing_region --proxy_host --proxy_port \n\n"); + fprintf(stdout, "endpoint: the endpoint of the mqtt server not including a port\n"); + fprintf( + stdout, + "cert: path to your client certificate in PEM format. If this is not set you must specify use_websocket\n"); + fprintf(stdout, "key: path to your key in PEM format. 
If this is not set you must specify use_websocket\n"); + fprintf(stdout, "topic: topic to publish, subscribe to.\n"); + fprintf(stdout, "client_id: client id to use (optional)\n"); + fprintf( + stdout, + "ca_file: Optional, if the mqtt server uses a certificate that's not already" + " in your trust store, set this.\n"); + fprintf(stdout, "\tIt's the path to a CA file in PEM format\n"); + fprintf(stdout, "use_websocket: if specified, uses a websocket over https (optional)\n"); + fprintf( + stdout, + "signing_region: used for websocket signer it should only be specific if websockets are used. (required for " + "websockets)\n"); + fprintf(stdout, "proxy_host: if you want to use a proxy with websockets, specify the host here (optional).\n"); + fprintf( + stdout, "proxy_port: defaults to 8080 is proxy_host is set. Set this to any value you'd like (optional).\n\n"); +} + +bool s_cmdOptionExists(char **begin, char **end, const String &option) +{ + return std::find(begin, end, option) != end; +} + +char *s_getCmdOption(char **begin, char **end, const String &option) +{ + char **itr = std::find(begin, end, option); + if (itr != end && ++itr != end) + { + return *itr; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + + /************************ Setup the Lib ****************************/ + /* + * Do the global initialization for the API. 
+ */ + ApiHandle apiHandle; + + String endpoint; + String certificatePath; + String keyPath; + String caFile; + String topic; + String clientId(Aws::Crt::UUID().ToString()); + String signingRegion; + String proxyHost; + uint16_t proxyPort(8080); + + bool useWebSocket = false; + + /*********************** Parse Arguments ***************************/ + if (!(s_cmdOptionExists(argv, argv + argc, "--endpoint") && s_cmdOptionExists(argv, argv + argc, "--topic"))) + { + s_printHelp(); + return 0; + } + + endpoint = s_getCmdOption(argv, argv + argc, "--endpoint"); + + if (s_cmdOptionExists(argv, argv + argc, "--key")) + { + keyPath = s_getCmdOption(argv, argv + argc, "--key"); + } + + if (s_cmdOptionExists(argv, argv + argc, "--cert")) + { + certificatePath = s_getCmdOption(argv, argv + argc, "--cert"); + } + + topic = s_getCmdOption(argv, argv + argc, "--topic"); + if (s_cmdOptionExists(argv, argv + argc, "--ca_file")) + { + caFile = s_getCmdOption(argv, argv + argc, "--ca_file"); + } + if (s_cmdOptionExists(argv, argv + argc, "--client_id")) + { + clientId = s_getCmdOption(argv, argv + argc, "--client_id"); + } + if (s_cmdOptionExists(argv, argv + argc, "--use_websocket")) + { + if (!s_cmdOptionExists(argv, argv + argc, "--signing_region")) + { + s_printHelp(); + } + useWebSocket = true; + signingRegion = s_getCmdOption(argv, argv + argc, "--signing_region"); + + if (s_cmdOptionExists(argv, argv + argc, "--proxy_host")) + { + proxyHost = s_getCmdOption(argv, argv + argc, "--proxy_host"); + } + + if (s_cmdOptionExists(argv, argv + argc, "--proxy_port")) + { + proxyPort = static_cast(atoi(s_getCmdOption(argv, argv + argc, "--proxy_port"))); + } + } + + /********************** Now Setup an Mqtt Client ******************/ + /* + * You need an event loop group to process IO events. 
+ * If you only have a few connections, 1 thread is ideal + */ + Io::EventLoopGroup eventLoopGroup(1); + if (!eventLoopGroup) + { + fprintf( + stderr, "Event Loop Group Creation failed with error %s\n", ErrorDebugString(eventLoopGroup.LastError())); + exit(-1); + } + + Aws::Crt::Io::DefaultHostResolver defaultHostResolver(eventLoopGroup, 1, 5); + Io::ClientBootstrap bootstrap(eventLoopGroup, defaultHostResolver); + + if (!bootstrap) + { + fprintf(stderr, "ClientBootstrap failed with error %s\n", ErrorDebugString(bootstrap.LastError())); + exit(-1); + } + + Aws::Iot::MqttClientConnectionConfigBuilder builder; + + if (!certificatePath.empty() && !keyPath.empty()) + { + builder = Aws::Iot::MqttClientConnectionConfigBuilder(certificatePath.c_str(), keyPath.c_str()); + } + else if (useWebSocket) + { + Aws::Iot::WebsocketConfig config(signingRegion, &bootstrap); + + if (!proxyHost.empty()) + { + Aws::Crt::Http::HttpClientConnectionProxyOptions proxyOptions; + proxyOptions.HostName = proxyHost; + proxyOptions.Port = proxyPort; + proxyOptions.AuthType = Aws::Crt::Http::AwsHttpProxyAuthenticationType::None; + config.ProxyOptions = std::move(proxyOptions); + } + + builder = Aws::Iot::MqttClientConnectionConfigBuilder(config); + } + else + { + s_printHelp(); + } + + if (!caFile.empty()) + { + builder.WithCertificateAuthority(caFile.c_str()); + } + + builder.WithEndpoint(endpoint); + + auto clientConfig = builder.Build(); + + if (!clientConfig) + { + fprintf( + stderr, + "Client Configuration initialization failed with error %s\n", + ErrorDebugString(clientConfig.LastError())); + exit(-1); + } + + Aws::Iot::MqttClient mqttClient(bootstrap); + /* + * Since no exceptions are used, always check the bool operator + * when an error could have occurred. + */ + if (!mqttClient) + { + fprintf(stderr, "MQTT Client Creation failed with error %s\n", ErrorDebugString(mqttClient.LastError())); + exit(-1); + } + + /* + * Now create a connection object. 
Note: This type is move only + * and its underlying memory is managed by the client. + */ + auto connection = mqttClient.NewConnection(clientConfig); + + if (!connection) + { + fprintf(stderr, "MQTT Connection Creation failed with error %s\n", ErrorDebugString(mqttClient.LastError())); + exit(-1); + } + + /* + * In a real world application you probably don't want to enforce synchronous behavior + * but this is a sample console application, so we'll just do that with a condition variable. + */ + std::mutex mutex; + std::condition_variable conditionVariable; + bool connectionSucceeded = false; + bool connectionClosed = false; + bool connectionCompleted = false; + + /* + * This will execute when an mqtt connect has completed or failed. + */ + auto onConnectionCompleted = [&](Mqtt::MqttConnection &, int errorCode, Mqtt::ReturnCode returnCode, bool) { + if (errorCode) + { + fprintf(stdout, "Connection failed with error %s\n", ErrorDebugString(errorCode)); + std::lock_guard lockGuard(mutex); + connectionSucceeded = false; + } + else + { + fprintf(stdout, "Connection completed with return code %d\n", returnCode); + connectionSucceeded = true; + } + { + std::lock_guard lockGuard(mutex); + connectionCompleted = true; + } + conditionVariable.notify_one(); + }; + + auto onInterrupted = [&](Mqtt::MqttConnection &, int error) { + fprintf(stdout, "Connection interrupted with error %s\n", ErrorDebugString(error)); + }; + + auto onResumed = [&](Mqtt::MqttConnection &, Mqtt::ReturnCode, bool) { fprintf(stdout, "Connection resumed\n"); }; + + /* + * Invoked when a disconnect message has completed. 
+ */ + auto onDisconnect = [&](Mqtt::MqttConnection &) { + { + fprintf(stdout, "Disconnect completed\n"); + std::lock_guard lockGuard(mutex); + connectionClosed = true; + } + conditionVariable.notify_one(); + }; + + connection->OnConnectionCompleted = std::move(onConnectionCompleted); + connection->OnDisconnect = std::move(onDisconnect); + connection->OnConnectionInterrupted = std::move(onInterrupted); + connection->OnConnectionResumed = std::move(onResumed); + + connection->SetOnMessageHandler([](Mqtt::MqttConnection &, const String &topic, const ByteBuf &payload) { + fprintf(stdout, "Generic Publish received on topic %s, payload:\n", topic.c_str()); + fwrite(payload.buffer, 1, payload.len, stdout); + fprintf(stdout, "\n"); + }); + + /* + * Actually perform the connect dance. + * This will use default ping behavior of 1 hour and 3 second timeouts. + * If you want different behavior, those arguments go into slots 3 & 4. + */ + fprintf(stdout, "Connecting...\n"); + if (!connection->Connect(clientId.c_str(), false, 1000)) + { + fprintf(stderr, "MQTT Connection failed with error %s\n", ErrorDebugString(connection->LastError())); + exit(-1); + } + + std::unique_lock uniqueLock(mutex); + conditionVariable.wait(uniqueLock, [&]() { return connectionCompleted; }); + + if (connectionSucceeded) + { + /* + * This is invoked upon the receipt of a Publish on a subscribed topic. + */ + auto onPublish = [&](Mqtt::MqttConnection &, const String &topic, const ByteBuf &byteBuf) { + fprintf(stdout, "Publish received on topic %s\n", topic.c_str()); + fprintf(stdout, "\n Message:\n"); + fwrite(byteBuf.buffer, 1, byteBuf.len, stdout); + fprintf(stdout, "\n"); + }; + + /* + * Subscribe for incoming publish messages on topic. 
+ */ + auto onSubAck = [&](Mqtt::MqttConnection &, uint16_t packetId, const String &topic, Mqtt::QOS, int errorCode) { + if (packetId) + { + fprintf(stdout, "Subscribe on topic %s on packetId %d Succeeded\n", topic.c_str(), packetId); + } + else + { + fprintf(stdout, "Subscribe failed with error %s\n", aws_error_debug_str(errorCode)); + } + conditionVariable.notify_one(); + }; + + connection->Subscribe(topic.c_str(), AWS_MQTT_QOS_AT_LEAST_ONCE, onPublish, onSubAck); + conditionVariable.wait(uniqueLock); + + while (true) + { + String input; + fprintf( + stdout, + "Enter the message you want to publish to topic %s and press enter. Enter 'exit' to exit this " + "program.\n", + topic.c_str()); + std::getline(std::cin, input); + + if (input == "exit") + { + break; + } + + ByteBuf payload = ByteBufNewCopy(DefaultAllocator(), (const uint8_t *)input.data(), input.length()); + ByteBuf *payloadPtr = &payload; + + auto onPublishComplete = [payloadPtr](Mqtt::MqttConnection &, uint16_t packetId, int errorCode) { + aws_byte_buf_clean_up(payloadPtr); + + if (packetId) + { + fprintf(stdout, "Operation on packetId %d Succeeded\n", packetId); + } + else + { + fprintf(stdout, "Operation failed with error %s\n", aws_error_debug_str(errorCode)); + } + }; + connection->Publish(topic.c_str(), AWS_MQTT_QOS_AT_LEAST_ONCE, false, payload, onPublishComplete); + } + + /* + * Unsubscribe from the topic. 
+ */ + connection->Unsubscribe( + topic.c_str(), [&](Mqtt::MqttConnection &, uint16_t, int) { conditionVariable.notify_one(); }); + conditionVariable.wait(uniqueLock); + } + + /* Disconnect */ + if (connection->Disconnect()) + { + conditionVariable.wait(uniqueLock, [&]() { return connectionClosed; }); + } + return 0; +} diff --git a/Tools/IoTConnectionTools/aws-pub-sub/sample.json b/Tools/IoTConnectionTools/aws-pub-sub/sample.json new file mode 100644 index 0000000000..0c118243b1 --- /dev/null +++ b/Tools/IoTConnectionTools/aws-pub-sub/sample.json @@ -0,0 +1,22 @@ +{ + "guid": "479AD17C-27E9-42F0-8CB1-14B48D098829", + "name": "AWS Pub Sub", + "categories": ["Toolkit/Intel® oneAPI IoT Toolkit/IoT Connection Tools"], + "description": "This sample uses the Message Broker for AWS IoT to send and receive messages through an MQTT connection.", + "dependencies": ["aws-iot-device-sdk-cpp-v2|https://github.com/aws/aws-iot-device-sdk-cpp-v2"], + "languages": [{"cpp":{}}], + "os": ["linux"], + "ciTests": { + "linux": [ + { "id": "aws-pub-sub", + "env": [], + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make" + ] + } + ] + } +}