diff --git a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md
index 63dbe2063b..b7e29823b0 100644
--- a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md
+++ b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/README.md
@@ -5,9 +5,9 @@ Mandelbrot is an infinitely complex fractal patterning that is derived from a si
| Optimized for | Description
|:--- |:---
-| OS | MacOS Catalina or newer; Linux* Ubuntu* 18.04
+| OS | MacOS Catalina or newer
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® C++ Compiler 19.1 or newer
+| Software | Intel® oneAPI C++ Compiler Classic
| What you will learn | How to optimize a scalar implementation using OpenMP pragmas
| Time to complete | 15 minutes
diff --git a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json
index ece8ab4756..9bf6d60004 100644
--- a/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json
+++ b/DirectProgramming/C++/CombinationalLogic/MandelbrotOMP/sample.json
@@ -2,16 +2,12 @@
"name": "Mandelbrot OpenMP*",
"description": "Calculates the mandelbrot set and outputs a bmp image representation using OpenMP*",
"categories": ["Toolkit/Intel® oneAPI HPC Toolkit"],
- "os": ["linux", "darwin"],
- "builder": ["cmake"],
+ "os": ["darwin"],
+ "builder": ["make"],
"languages": [{"cpp":{}}],
"toolchain": ["icc"],
"guid": "DD113F58-4D91-41BB-B46E-6CF2C0D9F6F9",
"ciTests": {
- "linux": [
- { "id": "standard", "steps": [ "make", "make run", "make clean" ] },
- { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] }
- ],
"darwin": [
{ "id": "standard", "steps": [ "make", "make run", "make clean" ] },
{ "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] }
diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md
index 50e0f51b90..a99d5b006c 100644
--- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md
+++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md
@@ -4,10 +4,10 @@ The intrinsic samples are designed to show how to utilize the intrinsics support
| Optimized for | Description
|:--- |:---
-| OS | Linux* Ubuntu* 18.04; MacOS* Catalina* or newer
+| OS | MacOS* Catalina* or newer
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® C++ Compiler 2021.1 or newer;
-| What you will learn | How to utlize intrinsics supported by the Intel® C++ Compiler
+| Software | Intel® oneAPI C++ Compiler Classic
+| What you will learn | How to utilize intrinsics supported by the Intel® oneAPI C++ Compiler Classic
| Time to complete | 15 minutes
diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json
index 8bc2fbc314..40360e7968 100644
--- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json
+++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/sample.json
@@ -1,17 +1,13 @@
{
"name": "Intrinsics C++",
- "description": "Demonstrates the intrinsic functions of the Intel® C++ Compiler",
+ "description": "Demonstrates the intrinsic functions of the Intel® oneAPI C++ Compiler Classic",
"categories": ["Toolkit/Intel® oneAPI HPC Toolkit"],
- "os": ["linux", "darwin"],
- "builder": ["cmake"],
+ "os": ["darwin"],
+ "builder": ["make"],
"languages": [{"cpp":{}}],
"toolchain": ["icc"],
"guid": "ACD0E89E-67CC-4CB4-87AB-B12B84962EAF",
"ciTests": {
- "linux": [
- { "id": "standard", "steps": [ "make", "make run", "make clean" ] },
- { "id": "debug", "steps": [ "make debug", "make debug_run", "make clean" ] }
- ],
"darwin": [
{ "id": "standard", "steps": [ "make", "make run", "make clean" ] },
{ "id": "debug", "steps": [ "make debug", "make debug_run", "make clean" ] }
diff --git a/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md b/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md
index ce51161a1a..43356ac52a 100644
--- a/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md
+++ b/DirectProgramming/C++/GraphTraversal/MergesortOMP/README.md
@@ -6,10 +6,10 @@ For more details about merge sort algorithm and top-down implementation, please
| Optimized for | Description
|:--- |:---
-| OS | Linux* Ubuntu* 18.04; MacOS Catalina or newer
+| OS | MacOS Catalina or newer
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® C++ Compiler 19.1 or newer;
-| What you will learn | How to accelerate a scalar program using OpenMP tasks
+| Software | Intel® oneAPI C++ Compiler Classic
+| What you will learn | How to accelerate a scalar program using OpenMP* tasks
| Time to complete | 15 minutes
Performance number tabulation
diff --git a/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json b/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json
index cde821978e..a58affeae8 100644
--- a/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json
+++ b/DirectProgramming/C++/GraphTraversal/MergesortOMP/sample.json
@@ -2,16 +2,12 @@
"name": "MergeSort C++/OpenMP*",
"description": "Classic sorting algorithm using OpenMP*",
"categories": ["Toolkit/Intel® oneAPI HPC Toolkit"],
- "os": ["linux", "darwin"],
- "builder": ["cmake"],
+ "os": ["darwin"],
+ "builder": ["make"],
"languages": [{"cpp":{}}],
"toolchain": ["icc"],
"guid": "5AFED65F-F725-411D-B21C-B59008D1166D",
"ciTests": {
- "linux": [
- { "id": "standard", "steps": [ "make", "make run", "make clean" ] },
- { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] }
- ],
"darwin": [
{ "id": "standard", "steps": [ "make", "make run", "make clean" ] },
{ "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] }
diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt
index 9cde07f558..8f608e972a 100644
--- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt
+++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt
@@ -1,4 +1,4 @@
-Copyright Intel Corporation
+Copyright 2019 Intel Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md
index 312bb4e783..53da36b8b1 100644
--- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md
+++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md
@@ -8,8 +8,8 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa
|:--- |:---
| OS | Linux* Ubuntu* 18.04; Windows 10
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler beta;
-| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler
+| Software | Intel® oneAPI DPC++/C++ Compiler
+| What you will learn | How to offload the computation to GPU using the Intel® oneAPI DPC++/C++ Compiler
| Time to complete | 15 minutes
## Purpose
diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj
index 19bac293d5..8a4eaa9d40 100644
--- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj
+++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj
@@ -114,7 +114,6 @@
Console
true
- $(ONEAPI_ROOT)\compiler\latest\windows\bin\libsycl-complex.o
@@ -152,10 +151,9 @@
true
true
true
- $(ONEAPI_ROOT)\compiler\latest\windows\bin\libsycl-complex.o
-
\ No newline at end of file
+
diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt
index 9cd8f8f64d..4c3d57303d 100644
--- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt
+++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt
@@ -2,10 +2,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++17")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
add_executable(mandelbrot main.cpp)
-target_link_libraries(mandelbrot OpenCL sycl $ENV{ONEAPI_ROOT}/compiler/latest/linux/lib/libsycl-complex.o)
-add_custom_target(run ${CMAKE_COMMAND} -E env SYCL_BE=PI_OPENCL ./mandelbrot)
+target_link_libraries(mandelbrot OpenCL sycl)
+add_custom_target(run ./mandelbrot)
add_executable(mandelbrot_usm main.cpp)
target_compile_definitions(mandelbrot_usm PRIVATE MANDELBROT_USM)
-target_link_libraries(mandelbrot_usm OpenCL sycl $ENV{ONEAPI_ROOT}/compiler/latest/linux/lib/libsycl-complex.o)
-add_custom_target(run_usm ${CMAKE_COMMAND} -E env SYCL_BE=PI_OPENCL ./mandelbrot_usm)
+target_link_libraries(mandelbrot_usm OpenCL sycl)
+add_custom_target(run_usm ./mandelbrot_usm)
diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp
index 991478032c..7c261a5e56 100644
--- a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp
+++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp
@@ -33,6 +33,10 @@ struct MandelParameters {
int max_iterations_;
typedef std::complex ComplexF;
+ static std::complex complex_square( std::complex c)
+ {
+ return std::complex( c.real()*c.real() - c.imag()*c.imag(), c.real()*c.imag()*2 );
+ }
MandelParameters(int row_count, int col_count, int max_iterations)
: row_count_(row_count),
@@ -41,7 +45,7 @@ struct MandelParameters {
int row_count() const { return row_count_; }
int col_count() const { return col_count_; }
- int max_iterations() const { return max_iterations_; }
+int max_iterations() const { return max_iterations_; }
// Scale from 0..row_count to -1.5..0.5
float ScaleRow(int i) const { return -1.5f + (i * (2.0f / row_count_)); }
@@ -63,7 +67,8 @@ struct MandelParameters {
break;
}
- z = z * z + c;
+ // z = z * z + c;
+ z = complex_square(z) + c;
count++;
}
diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md
index 759b7e1576..db05d53647 100644
--- a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md
+++ b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md
@@ -7,8 +7,8 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa
|:--- |:---
| OS | Linux Ubuntu 18.04, Windows 10
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler (beta)
-| What you will learn | The Sepia Filter sample demonstrates the following using the oneAPI DPC++ compiler Writing a custom device selector class Offloading compute intensive parts of the application using both lamba and functor kernels Measuring kernel execution time by enabling profiling
+| Software | Intel® oneAPI DPC++/C++ Compiler
+| What you will learn | The Sepia Filter sample demonstrates the following using the Intel® oneAPI DPC++/C++ Compiler: Writing a custom device selector class Offloading compute intensive parts of the application using both lambda and functor kernels Measuring kernel execution time by enabling profiling
| Time to complete | 20 minutes
## Purpose
diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json
index 6abd3d250f..e2ff514d31 100644
--- a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json
+++ b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sample.json
@@ -1,7 +1,7 @@
{
"guid": "B9C425DB-A3AD-4FCB-9CA0-1909E5189FB7",
"name": "Sepia Filter",
- "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU"],
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU"],
"toolchain": ["dpcpp"],
"description": "A program that converts an image to sepia tone",
"languages": [{
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md
index 4ef647b606..246791a6fd 100644
--- a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/README.md
@@ -11,7 +11,7 @@ custom types of classes in a DPC++ program
|:--- |:---
| OS | Linux Ubuntu 18.04, Windows 10
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler (beta)
+| Software | Intel® oneAPI DPC++/C++ Compiler
| What you will learn | Using custom type classes and offloads complex number computations to GPU using Intel DPC++
| Time to complete | 15 minutes
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json
index 5b2c4309a1..2824cf8808 100644
--- a/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/complex_mult/sample.json
@@ -1,7 +1,7 @@
{
"guid": "D725E06E-0ECE-44F8-910D-AD1A8C89ED89",
"name": "Complex number Multiplication",
- "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ],
+ "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU" ],
"description": "program that computes the multiplication of a Complex number",
"toolchain": [ "dpcpp" ],
"languages": [ { "cpp": { "properties": { "projectOptions": [ { "projectType": "makefile" } ] } } } ],
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md
index c50970d237..34eebdfd9e 100644
--- a/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/matrix_mul/README.md
@@ -10,7 +10,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa
|:--- |:---
| OS | Linux* Ubuntu* 18.04, Windows 10*
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler beta, Intel® C/C++ Compiler beta
+| Software | Intel® oneAPI DPC++/C++ Compiler, Intel® oneAPI C++ Compiler Classic
| What you will learn | Offloads computations on 2D arrays to GPU using Intel DPC++ and OpenMP
| Time to complete | 15 minutes
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md
index 662c9df298..7c156e79f7 100644
--- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md
@@ -8,7 +8,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa
|:--- |:---
| OS | Linux* Ubuntu* 18.04, Windows 10
| Hardware | Skylake with GEN9 or newer, Intel(R) Programmable Acceleration Card with Intel(R) Arria(R) 10 GX FPGA
-| Software | Intel® oneAPI DPC++ Compiler (beta)
+| Software | Intel® oneAPI DPC++/C++ Compiler
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json
index cb7d58bb6a..619d872475 100644
--- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json
@@ -1,7 +1,7 @@
{
"guid" : "49C65CB6-F9FA-4E3C-B8BE-4A141E4E0F07",
"name": "Simple Add",
- "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU, GPU and FPGA"],
+ "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU, GPU and FPGA"],
"description": "Simple program that adds two large vectors in parallel. Provides a ‘Hello World!’ like sample to ensure your environment is setup correctly using Data Parallel C++.",
"toolchain": ["dpcpp"],
"languages": [{"cpp": {"properties": {"projectOptions": [{"projectType": "makefile"}]}}}],
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md
index 9f32169505..ba8a52deaa 100644
--- a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/README.md
@@ -8,7 +8,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa
|:--- |:---
| OS | Linux* Ubuntu* 18.04, Windows 10
| Hardware | Skylake with GEN9 or newer, Intel(R) Programmable Acceleration Card with Intel(R) Arria(R) 10 GX FPGA
-| Software | Intel® oneAPI DPC++ Compiler (beta)
+| Software | Intel® oneAPI DPC++/C++ Compiler
## Purpose
The `vector-add` is a simple program that adds two large vectors of integers and verifies the results. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators.
diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json
index 9737eea2fb..f86a214617 100644
--- a/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json
+++ b/DirectProgramming/DPC++/DenseLinearAlgebra/vector-add/sample.json
@@ -1,7 +1,7 @@
{
"guid":"b1b58be7-e22e-4ca2-ba59-6887b2f1be6c",
"name": "Vector Add",
- "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU, GPU and FPGA"],
+ "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU, GPU and FPGA"],
"description": "Simple program that adds two large vectors in parallel. Provides a ‘Hello World!’ like sample to ensure your environment is setup correctly using simple Data Parallel C++.",
"toolchain": ["dpcpp"],
"languages": [{"cpp": {"properties": {"projectOptions": [{"projectType": "makefile"}]}}}],
diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md
index 061f753ed0..3e28e7c495 100644
--- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md
+++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md
@@ -13,8 +13,8 @@ and search based on relevant terms noted in the comments.
|:--- |:---
| OS | Linux Ubuntu 18.04
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta)
-| What you will learn | Implement bitonic sort using Intel DPC++ compiler
+| Software | Intel® oneAPI DPC++/C++ Compiler
+| What you will learn | Implement bitonic sort using Intel® oneAPI DPC++/C++ Compiler
| Time to complete | 15 minutes
@@ -51,7 +51,7 @@ if a compatible GPU is not detected.
## Key Implementation Details
The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command g
-roups. Unified Shared Memory (USM) is used for data management.
+roups. Unified Shared Memory (USM) and Buffer Object are used for data management.
## License
This code sample is licensed under MIT license
@@ -117,7 +117,10 @@ the ascending order is verified, the application will display a “Success!” m
$ ./bitonic-sort 21 47
Array size: 2097152, seed: 47
Device: Intel(R) Gen9 HD Graphics NEO
-Kernel time: 0.416827 sec
-CPU serial time: 0.60523 sec
+Warm up ...
+Kernel time using USM: 0.248422 sec
+Kernel time using buffer allocation: 0.253364 sec
+CPU serial time: 0.628803 sec
+
Success!
```
diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json
index c382d764e1..75efdfa0f0 100644
--- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json
+++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json
@@ -1,7 +1,7 @@
{
"guid": "4D5B57B8-6F34-4A11-89F5-3F07E766DB39",
"name": "bitonic-sort",
- "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ],
+ "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU" ],
"description": "Bitonic Sort using Intel® oneAPI DPC++ Language",
"toolchain": [ "dpcpp" ],
"targetDevice": [ "CPU", "GPU" ],
diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp
index e0e4312520..0153bf4cd1 100644
--- a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp
+++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp
@@ -35,38 +35,93 @@
// data to the kernel. The kernel swaps the elements accordingly in parallel.
//
#include
-#include
#include
+// dpc_common.hpp can be found in the dev-utilities include folder.
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp
+#include "dpc_common.hpp"
+
using namespace sycl;
using namespace std;
-void ParallelBitonicSort(int a[], int n, queue &q) {
+#define DEBUG 0
+
+void ParallelBitonicSort(int data_gpu[], int n, queue &q) {
// n: the exponent used to set the array size. Array size = power(2, n)
int size = pow(2, n);
+ int* a = data_gpu;
+
+ // step from 0, 1, 2, ...., n-1
+ for (int step = 0; step < n; step++) {
+ // for each step s, stage goes s, s-1, ..., 0
+ for (int stage = step; stage >= 0; stage--) {
+ int seq_len = pow(2, stage + 1);
+
+ // Constant used in the kernel: 2**(step-stage).
+ int two_power = 1 << (step - stage);
+ // Offload the work to kernel.
+ q.submit([&](handler &h) {
+ h.parallel_for(range<1>(size), [=](id<1> i) {
+ // Assign the bitonic sequence number.
+ int seq_num = i / seq_len;
+
+          // Variable used to identify the swapped element.
+ int swapped_ele = -1;
+
+ // Because the elements in the first half in the bitonic
+ // sequence may swap with elements in the second half,
+ // only the first half of elements in each sequence is
+ // required (seq_len/2).
+ int h_len = seq_len / 2;
+
+ if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len;
+
+ // Check whether increasing or decreasing order.
+ int odd = seq_num / two_power;
+
+ // Boolean variable used to determine "increasing" or
+ // "decreasing" order.
+ bool increasing = ((odd % 2) == 0);
+
+ // Swap the elements in the bitonic sequence if needed
+ if (swapped_ele != -1) {
+ if (((a[i] > a[swapped_ele]) && increasing) ||
+ ((a[i] < a[swapped_ele]) && !increasing)) {
+ int temp = a[i];
+ a[i] = a[swapped_ele];
+ a[swapped_ele] = temp;
+ }
+ }
+ });
+ });
+ q.wait();
+ } // end stage
+ } // end step
+}
+
+void ParallelBitonicSortBuffer(int data_gpu[], int n, queue &q) {
+ // n: the exponent used to set the array size. Array size = power(2, n)
+ int size = pow(2, n);
+
+ buffer input (data_gpu, size);
+
// step from 0, 1, 2, ...., n-1
for (int step = 0; step < n; step++) {
// for each step s, stage goes s, s-1, ..., 0
for (int stage = step; stage >= 0; stage--) {
- // In each state, construct a number (num_seq) of bitonic sequences of
- // size seq_len (2, 4, ...) num_seq stores the number of bitonic sequences
- // at each stage. seq_len stores the length of the bitonic sequence at
- // each stage.
int seq_len = pow(2, stage + 1);
-#if DEBUG
- int num_seq = pow(2, (n - stage - 1)); // Used for debug purpose.
- std::cout << "step num:" << step << " stage num:" << stage
- << " num_seq:" << num_seq << "(" << seq_len << ") => ";
-#endif
+
// Constant used in the kernel: 2**(step-stage).
int two_power = 1 << (step - stage);
// Offload the work to kernel.
q.submit([&](handler &h) {
- h.parallel_for(range<1>(size), [=](id<1> i) {
+ auto a = input.get_access(h);
+
+ h.parallel_for(range<1>(size), [=](id<1> i) {
// Assign the bitonic sequence number.
- int seq_num = i / seq_len;
+ int seq_num = i / seq_len;
// Variable used to identified the swapped element.
int swapped_ele = -1;
@@ -190,40 +245,62 @@ int main(int argc, char *argv[]) {
std::cout << "Device: " << q.get_device().get_info()
<< "\n";
+ // Memory allocated for host access only.
+ int *data_cpu = (int *)malloc(size * sizeof(int));
+
// USM allocation using malloc_shared: data stores a sequence of random
// numbers.
- int *data = malloc_shared(size, q);
+ int *data_usm = malloc_shared(size, q);
- // Memory allocated for host access only.
- int *data2 = (int *)malloc(size * sizeof(int));
+ // Memory allocated to store gpu results using buffer allocation
+ int *data_gpu = (int *)malloc(size * sizeof(int));
// Initialize the array randomly using a seed.
srand(seed);
- for (int i = 0; i < size; i++) data[i] = data2[i] = rand() % 1000;
+ for (int i = 0; i < size; i++)
+ data_usm[i] = data_gpu[i] = data_cpu[i] = rand() % 1000;
#if DEBUG
std::cout << "\ndata before:\n";
- DisplayArray(data, size);
+ DisplayArray(data_usm, size);
#endif
+ // Warm up
+ std::cout << "Warm up ...\n";
+ ParallelBitonicSort(data_usm, n, q);
+
// Start timer
dpc_common::TimeInterval t_par;
- ParallelBitonicSort(data, n, q);
+ // Parallel sort using USM
+ ParallelBitonicSort(data_usm, n, q);
- std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n";
+ std::cout << "Kernel time using USM: " << t_par.Elapsed() << " sec\n";
#if DEBUG
- std::cout << "\ndata after sorting using parallel bitonic sort:\n";
- DisplayArray(data, size);
+ std::cout << "\ndata_usm after sorting using parallel bitonic sort:\n";
+ DisplayArray(data_usm, size);
#endif
+ // Start timer
+ dpc_common::TimeInterval t_par2;
+
+ // Parallel sort using buffer allocation
+ ParallelBitonicSortBuffer(data_gpu, n, q);
+
+ std::cout << "Kernel time using buffer allocation: " << t_par2.Elapsed() << " sec\n";
+
+#if DEBUG
+ std::cout << "\ndata_gpu after sorting using parallel bitonic sort:\n";
+ DisplayArray(data_gpu, size);
+#endif
+
// Start timer
dpc_common::TimeInterval t_ser;
// Bitonic sort in CPU (serial)
- BitonicSort(data2, n);
+ BitonicSort(data_cpu, n);
std::cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n";
@@ -231,18 +308,22 @@ int main(int argc, char *argv[]) {
bool pass = true;
for (int i = 0; i < size - 1; i++) {
// Validate the sequence order is increasing in both kernel and CPU.
- if ((data[i] > data[i + 1]) || (data[i] != data2[i])) {
+ if ((data_usm[i] > data_usm[i + 1]) || (data_usm[i] != data_cpu[i])) {
pass = false;
break;
}
+
+ if ((data_gpu[i] > data_gpu[i + 1]) || (data_gpu[i] != data_cpu[i])) {
+ pass = false;
+ break;
+ }
}
- // Clean USM resources.
- free(data, q);
-
- // Clean CPU memory.
- free(data2);
-
+ // Clean resources.
+ free(data_cpu);
+ free(data_usm, q);
+ free(data_gpu);
+
if (!pass) {
std::cout << "\nFailed!\n";
return -2;
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/CMakeLists.txt b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/CMakeLists.txt
new file mode 100644
index 0000000000..07ec9bb778
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/CMakeLists.txt
@@ -0,0 +1,30 @@
+# required cmake version
+cmake_minimum_required(VERSION 3.5)
+
+project (hidden-markov-models)
+
+if(WIN32)
+ set(CMAKE_CXX_COMPILER "dpcpp-cl")
+else()
+ set(CMAKE_CXX_COMPILER "dpcpp")
+endif()
+
+# Set default build type to RelWithDebInfo if not specified
+if (NOT CMAKE_BUILD_TYPE)
+ message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info")
+ set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE
+ STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel"
+ FORCE)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17")
+
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl")
+
+add_executable (hidden-markov-models src/hidden-markov-models.cpp)
+
+add_custom_target (run
+ COMMAND hidden-markov-models
+ WORKING_DIRECTORY ${CMAKE_PROJECT_DIR}
+)
+
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/License.txt b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/License.txt
new file mode 100644
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/README.md b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/README.md
new file mode 100644
index 0000000000..8a880848c6
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/README.md
@@ -0,0 +1,89 @@
+# `DPC++ Hidden Markov Model` Sample
+The HMM (Hidden Markov Model) sample presents a statistical model using a Markov process to present graphable nodes that are otherwise in an unobservable state or “hidden”. This technique is helpful in pattern recognition such as speech, handwriting, gesture recognition, part-of-speech tagging, partial discharges and bioinformatics. The sample offloads the complexity of the Markov process to the GPU.
+
+The directed edges of this graph are possible transitions between nodes or states defined with the following parameters: the number of states is N, the transition matrix A is a square matrix of size N. Each element with indexes (i,j) of this matrix determines the probability to move from the state i to the state j on any step of the Markov process (i and j can be the same if the state does not change on the taken step).
+
+The main assumption of the HMM is that there are visible observations that depend on the current Markov process. That dependency can be described as a conditional probability distribution (represented by emission matrix). The problem is to find out the most likely chain of the hidden Markov states using the given observations set.
+
+## Requirements and sample info
+
+| Optimized for | Description
+|:--- |:---
+| OS | Linux* Ubuntu* 18.04, Windows 10
+| Hardware | Skylake with GEN9 or newer
+| Software | Intel® oneAPI DPC++ Compiler (beta)
+| What you will learn | Implement Viterbi algorithm to get the most likely path that consists of the hidden states
+| Time to complete | 1 minute
+
+## Purpose
+
+The sample can use GPU offload to compute sequential steps of multiple graph traversals simultaneously.
+
+This code sample implements the Viterbi algorithm which is a dynamic programming algorithm for finding the most likely sequence of hidden states—called the Viterbi path—that results in a sequence of observed events, especially in the context of Markov information sources and HMM.
+
+- Initially, the dataset for algorithm processing is generated: initial states probability distribution Pi, transition matrix A, emission matrix B and the sequence of the observations produced by the hidden Markov process.
+- First, the matrix of Viterbi values on the first states are initialized using distribution Pi and emission matrix B. The matrix of back pointers is initialized with default values -1.
+- Then, for each time step the Viterbi matrix is set to the maximal possible value using A, B and Pi.
+- Finally, the state with maximum Viterbi value on the last step is set as a final state of the Viterbi path and the previous nodes of this path are determined using the correspondent rows of back pointers matrix for each of the steps except the last one.
+
+Note: The implementation uses logarithms of the probabilities to process small numbers correctly and to replace multiplication operations with addition operations.
+
+## Key Implementation Details
+
+The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command groups.
+
+## License
+This code sample is licensed under MIT license.
+
+## Building the `DPC++ Hidden Markov Model` Program for CPU and GPU
+
+### Include Files
+The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include on your development system.
+
+### On a Linux* System
+1. Build the program using the following `cmake` commands.
+ ```
+ $ cd hidden-markov-models
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+ ```
+
+2. Run the program:
+ ```
+ make run
+ ```
+
+3. Clean the program using:
+ ```
+ make clean
+ ```
+
+### On a Windows* System Using a Command Line Interface
+ * Build the program using VS2017 or VS2019
+ Right click on the solution file and open using either VS2017 or VS2019 IDE.
+ Right click on the project in Solution explorer and select Rebuild.
+ From top menu select Debug -> Start without Debugging.
+
+ * Build the program using MSBuild
+ Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for
+ VS2019"
+ Run - MSBuild hidden-markov-models.sln /t:Rebuild /p:Configuration="Release"
+
+### On a Windows* System Using Visual Studio* Version 2017 or Newer
+Perform the following steps:
+1. Locate and select the `hidden-markov-models.sln` file.
+2. Select the configuration 'Debug' or 'Release'.
+3. Select **Project** > **Build** menu option to build the selected configuration.
+4. Select **Debug** > **Start Without Debugging** menu option to run the program.
+
+## Running the Sample
+### Application Parameters
+There are no editable parameters for this sample.
+
+### Example of Output
+Device: Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz Intel(R) OpenCL
+The Viterbi path is:
+19 18 17 16 15 14 13 12 11 10
+The sample completed successfully!
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.filters b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.filters
new file mode 100644
index 0000000000..5f08be7fdb
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.filters
@@ -0,0 +1,22 @@
+
+
+
+
+ {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
+ cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
+
+
+ {93995380-89BD-4b04-88EB-625FBE52EBFB}
+ h;hh;hpp;hxx;hm;inl;inc;ipp;xsd
+
+
+ {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
+ rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
+
+
+
+
+ Source Files
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.sln b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.sln
new file mode 100644
index 0000000000..10106f9039
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.30320.27
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "hidden-markov-models", "hidden-markov-models.vcxproj", "{46454D0B-76F3-45EB-A186-F315A2E22DEA}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.ActiveCfg = Debug|x64
+ {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.Build.0 = Debug|x64
+ {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.ActiveCfg = Release|x64
+ {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {B1D84B81-F5D5-4459-AA6E-38B695FB908B}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.user b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.user
new file mode 100644
index 0000000000..fa6ed154c1
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.user
@@ -0,0 +1,9 @@
+
+
+
+ WindowsLocalDebugger
+
+
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj
new file mode 100644
index 0000000000..e894a8cca6
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj
@@ -0,0 +1,144 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+ 15.0
+ {46454d0b-76f3-45eb-a186-f315a2e22dea}
+ Win32Proj
+ hidden-markov-models
+ $(WindowsSDKVersion.Replace("\",""))
+ hidden-markov-models
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+
+
+
+
+
+
+ Console
+ true
+
+
+
+
+
+
+
+
+ %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories)
+ Disabled
+ Level3
+
+
+ Console
+ true
+ /Od;%(SpecifyDevCmplAdditionalOptions)
+
+
+
+
+
+
+
+
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
+
+
+ %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories)
+ Disabled
+ Level3
+
+
+ Console
+ true
+ true
+ true
+ /Od;%(SpecifyDevCmplAdditionalOptions)
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj.user b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj.user
new file mode 100644
index 0000000000..e631a72cce
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/hidden-markov-models.vcxproj.user
@@ -0,0 +1,17 @@
+
+
+
+ cpu
+ WindowsLocalDebugger
+ CL_CONFIG_USE_NATIVE_DEBUGGER=1
+SYCL_DEVICE_TYPE=CPU
+$(LocalDebuggerEnvironment)
+
+
+ cpu
+ WindowsLocalDebugger
+ CL_CONFIG_USE_NATIVE_DEBUGGER=1
+SYCL_DEVICE_TYPE=CPU
+$(LocalDebuggerEnvironment)
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/sample.json b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/sample.json
new file mode 100644
index 0000000000..6dadf9de3f
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/sample.json
@@ -0,0 +1,29 @@
+{
+ "guid": "A63E408B-75ED-4379-A6B5-AF013C0EBA58",
+ "name": "hidden-markov-models",
+ "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ],
+  "description": "Hidden Markov Models using Intel® oneAPI DPC++ Language",
+ "toolchain": [ "dpcpp" ],
+ "targetDevice": [ "CPU", "GPU" ],
+ "languages": [ { "cpp": {} } ],
+ "os": [ "linux", "windows" ],
+ "builder": [ "ide", "cmake" ],
+ "ciTests": {
+ "linux": [{
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make",
+ "make run"
+ ]
+ }],
+ "windows": [{
+ "steps": [
+ "MSBuild hidden-markov-models.sln /t:Rebuild /p:Configuration=\"Release\"",
+ "cd x64/Release",
+ "hidden-markov-models.exe"
+ ]
+ }]
+ }
+}
diff --git a/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/src/hidden-markov-models.cpp b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/src/hidden-markov-models.cpp
new file mode 100644
index 0000000000..6b2e91a8c6
--- /dev/null
+++ b/DirectProgramming/DPC++/GraphTraversal/hidden-markov-models/src/hidden-markov-models.cpp
@@ -0,0 +1,189 @@
+//==============================================================
+// Copyright © Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Hidden Markov Models: this code sample implements the Viterbi algorithm which is a dynamic
+// programming algorithm for finding the most likely sequence of hidden states—
+// called the Viterbi path—that results in a sequence of observed events,
+// especially in the context of Markov information sources and HMM.
+//
+// The sample can use GPU offload to compute sequential steps of multiple graph traversals simultaneously.
+//
+// - Initially, the dataset for algorithm processing is generated : initial states probability
+// distribution Pi, transition matrix A, emission matrix B and the sequence of the observations
+// produced by hidden Markov process.
+// - First, the matrix of Viterbi values on the first states are initialized using distribution Pi
+// and emission matrix B. The matrix of back pointers is initialized with default values -1.
+// - Then, for each time step the Viterbi matrix is set to the maximal possible value using A, B and Pi.
+// - Finally, the state with maximum Viterbi value on the last step is set as a final state of
+// the Viterbi path and the previous nodes of this path are determined using the correspondent rows
+// of back pointers matrix for each of the steps except the last one.
+//
+// Note: The implementation uses logarithms of the probabilities to process small numbers correctly
+// and to replace multiplication operations with addition operations.
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+// dpc_common.hpp can be found in the dev-utilities include folder.
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp
+#include "dpc_common.hpp"
+
+using namespace sycl;
+using namespace std;
+
+// Matrix size constants.
+// The number of hidden states N.
+constexpr int N = 20;
+// The number of possible observations M.
+constexpr int M = 20;
+// The length of the hidden states sequence T.
+constexpr int T = 20;
+// The parameter for generating the sequence.
+constexpr int seed = 0;
+// Minimal double to initialize logarithms for Viterbi values equal to 0.
+constexpr double MIN_DOUBLE = -1.0 * std::numeric_limits::max();
+
+bool ViterbiCondition(double x, double y, double z, double compare);
+
+int main() {
+ try {
+ // Initializing and generating initial probabilities for the hidden states.
+ double(*pi) = new double[N];
+ for (int i = 0; i < N; ++i) {
+ pi[i] = sycl::log10(1.0f / N);
+ }
+ buffer pi_buf(pi, N);
+
+ //Device initialization.
+ queue q(default_selector{}, dpc_common::exception_handler);
+ cout << "Device: " << q.get_device().get_info() << " "
+ << q.get_device().get_platform().get_info() << "\n";
+
+ //Buffers initialization.
+ buffer viterbi(range<2>(N, T));
+ buffer back_pointer(range<2>(N, T));
+ buffer a(range<2>(N, N));
+ buffer b(range<2>(N, M));
+
+ // Generating transition matrix A for the Markov process.
+ q.submit([&](handler& h) {
+ auto a_acc = a.get_access(h);
+ h.parallel_for(range<2>(N, N), [=](id<2> index) {
+ // The sum of the probabilities in each row of the matrix A has to be equal to 1.
+ double prob = 1.0f / N;
+ // The algorithm computes logarithms of the probability values to improve small numbers processing.
+ a_acc[index] = sycl::log10(prob);
+ });
+ });
+
+ // Generating emission matrix B for the Markov process.
+ q.submit([&](handler& h) {
+ auto b_acc = b.get_access(h);
+ h.parallel_for(range<2>(N, M), [=](id<2> index) {
+ // The sum of the probabilities in each row of the matrix B has to be equal to 1.
+ double prob = ((index[0] + index[1]) % M) * 2.0f / M / (M - 1);
+ // The algorithm computes logarithms of the probability values to improve small numbers processing.
+ b_acc[index] = (prob == 0.0f) ? MIN_DOUBLE : sycl::log10(prob);
+ });
+ });
+
+ // Generating the sequence of the observations produced by the hidden Markov chain.
+ int(*seq) = new int[T];
+ for (int i = 0; i < T; ++i) {
+ seq[i] = (i * i + seed) % M;
+ }
+ buffer seq_buf(seq, T);
+
+ // Initialization of the Viterbi matrix and the matrix of back pointers.
+ q.submit([&](handler& h) {
+ auto v_acc = viterbi.get_access(h);
+ auto b_ptr_acc = back_pointer.get_access(h);
+ auto b_acc = b.get_access(h);
+ auto pi_acc = pi_buf.get_access(h);
+ auto seq_acc = seq_buf.get_access(h);
+ h.parallel_for(range<2>(N, T), [=](id<2> index) {
+ int i = index[0];
+ int j = index[1];
+ // At starting point only the first Viterbi values are defined and these Values are substituted
+ // with logarithms due to the following equation: log(x*y) = log(x) + log(y).
+ v_acc[index] = (j != 0) ? MIN_DOUBLE : pi_acc[i] + b_acc[i][seq_acc[0]];
+ // Default values of all the back pointers are (-1) to show that they are not determined yet.
+ b_ptr_acc[index] = -1;
+ });
+ });
+ delete[] pi;
+
+ // The sequential steps of the Viterbi algorithm that define the Viterbi matrix and the matrix
+ // of back pointers. The product of the Viterbi values and the probabilities is substituted with the sum of
+ // the logarithms due to the following equation: log (x*y*z) = log(x) + log(y) + log(z).
+ for (int j = 0; j < T - 1; ++j) {
+ q.submit([&](handler& h) {
+ auto v_acc = viterbi.get_access(h);
+ auto b_ptr_acc = back_pointer.get_access(h);
+ auto a_acc = a.get_access (h);
+ auto b_acc = b.get_access (h);
+ auto seq_acc = seq_buf.get_access (h);
+
+ h.parallel_for(range<2>(N, N), [=](id<2> index) {
+ int i = index[0], k = index[1];
+ // This conditional block finds the maximum possible Viterbi value on
+ // the current step j for the state i.
+ if (ViterbiCondition(v_acc[k][j], b_acc[i][seq_acc[j + 1]], a_acc[k][i], v_acc[i][j + 1])) {
+ v_acc[i][j + 1] = v_acc[k][j] + a_acc[k][i] + b_acc[i][seq_acc[j + 1]];
+ b_ptr_acc[i][j + 1] = k;
+ }
+ });
+ });
+ }
+ delete[] seq;
+
+ // Getting the Viterbi path based on the matrix of back pointers
+ buffer vit_path(range<1> {T});
+ auto v_acc = viterbi.get_access();
+ auto b_ptr_acc = back_pointer.get_access();
+ auto vit_path_acc = vit_path.get_access();
+ double v_max = MIN_DOUBLE;
+ // Constructing the Viterbi path. The last state of this path is the one with
+ // the biggest Viterbi value (the most likely state).
+ for (int i = 0; i < N; ++i) {
+ if (v_acc[i][T - 1] > v_max) {
+ v_max = v_acc[i][T - 1];
+ vit_path_acc[T - 1] = i;
+ }
+ }
+
+ for (int i = T - 2; i >= 0; --i) {
+ // Every back pointer starting from the last one contains the index of the previous
+ // point in Viterbi path.
+ vit_path_acc[i] = b_ptr_acc[vit_path_acc[i + 1]][i + 1];
+ }
+
+ cout << "The Viterbi path is: "<< std::endl;
+ for (int k = 0; k < T; ++k) {
+ cout << vit_path_acc[k] << " ";
+ }
+ cout << std::endl;
+
+ } catch (sycl::exception const& e) {
+ // Exception processing
+ cout << "An exception is caught!\n";
+ cout << "Error message:" << e.what();
+ terminate();
+ }
+ cout << "The sample completed successfully!" << std::endl;
+ return 0;
+}
+
+// The method checks if all three components of the sum are not equivalent to logarithm of zero
+// (that is incorrect value and is substituted with minimal possible value of double) and that
+// the Viterbi value on the new step exceeds the current one.
+bool ViterbiCondition(double x, double y, double z, double compare) {
+ return (x > MIN_DOUBLE) && (y > MIN_DOUBLE) && (z > MIN_DOUBLE) && (x + y + z > compare);
+}
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt
new file mode 100644
index 0000000000..85fcec4963
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/CMakeLists.txt
@@ -0,0 +1,30 @@
+# required cmake version
+cmake_minimum_required(VERSION 3.5)
+
+project (PrefixSum)
+
+if(WIN32)
+ set(CMAKE_CXX_COMPILER "dpcpp")
+else()
+ set(CMAKE_CXX_COMPILER "dpcpp")
+endif()
+
+# Set default build type to RelWithDebInfo if not specified
+if (NOT CMAKE_BUILD_TYPE)
+ message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info")
+ set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE
+ STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel"
+ FORCE)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17")
+
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl")
+
+add_executable (PrefixSum src/PrefixSum.cpp)
+
+add_custom_target (run
+ COMMAND PrefixSum 21 47
+ WORKING_DIRECTORY ${CMAKE_PROJECT_DIR}
+)
+
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt
new file mode 100644
index 0000000000..415025cf03
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln
new file mode 100644
index 0000000000..3587a92e74
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.29926.136
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PrefixSum", "PrefixSum.vcxproj", "{BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.ActiveCfg = Debug|x64
+ {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Debug|x64.Build.0 = Debug|x64
+ {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.ActiveCfg = Release|x64
+ {BC12ABE6-7951-47D6-93DC-126F8A5FCFD2}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {9B9594EB-112B-4FAE-AD1F-04BD8FF34B9F}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj
new file mode 100644
index 0000000000..6a6309b96b
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj
@@ -0,0 +1,137 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+ 15.0
+ {bc12abe6-7951-47d6-93dc-126f8a5fcfd2}
+ Win32Proj
+ PrefixSum
+ 10.0.17763.0
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+
+
+
+
+
+
+ Console
+ true
+
+
+
+
+
+
+
+
+ %ONEAPI_ROOT%\dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+
+
+
+
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
+
+
+ %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories)
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters
new file mode 100644
index 0000000000..2003dce0f2
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.filters
@@ -0,0 +1,22 @@
+
+
+
+
+ {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
+ cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
+
+
+ {93995380-89BD-4b04-88EB-625FBE52EBFB}
+ h;hh;hpp;hxx;hm;inl;inc;ipp;xsd
+
+
+ {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
+ rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
+
+
+
+
+ Source Files
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user
new file mode 100644
index 0000000000..7288fa06dd
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/PrefixSum.vcxproj.user
@@ -0,0 +1,11 @@
+
+
+
+ 21 47
+ WindowsLocalDebugger
+
+
+ 21 47
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md
new file mode 100644
index 0000000000..6bbc2cfdfb
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/README.md
@@ -0,0 +1,124 @@
+# `Prefix Sum` sample
+
+This code sample demonstrates the implementation of parallel prefix sum using Intel Data Parallel C++ to
+offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given
+(n is a positive number) as input, and the algorithm computes the prefix sum in parallel. The result sequence is
+in ascending order.
+
+For comprehensive instructions regarding DPC++ Programming, go to
+https://software.intel.com/en-us/oneapi-programming-guide
+and search based on relevant terms noted in the comments.
+
+| Optimized for | Description
+|:--- |:---
+| OS | Linux Ubuntu 18.04
+| Hardware | Skylake with GEN9 or newer
+| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta)
+| What you will learn | Implement parallel prefix sum using Intel DPC++ compiler
+| Time to complete | 15 minutes
+
+
+## Purpose
+
+Given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and returns
+a new sequence y0, y1, y2, ..., yn so that
+
+y0 = x0
+y1 = x0 + x1
+y2 = x0 + x1 + x2
+.....
+yn = x0 + x1 + x2 + ... + xn
+
+Below is the pseudo code for computing prefix sum in parallel:
+
+n is power of 2 (1, 2, 4 , 8, 16, ...):
+
+for i from 0 to [log2 n] - 1 do
+ for j from 0 to (n-1) do in parallel
+ if j<2^i then
+ x_{j}^{i+1} <- x_{j}^{i}}
+ else
+ x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}}
+
+In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i.
+Given n processors to perform each iteration of the inner loop in constant time, the algorithm
+as a whole runs in O(log n) time, the number of iterations of the outer loop.
+
+The code will attempt first to execute on an available GPU and fallback to the system's CPU if a
+compatible GPU is not detected.
+
+## Key Implementation Details
+
+The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command
+groups.
+
+## License
+This code sample is licensed under MIT license
+
+## Building the `PrefixSum` Program for CPU and GPU
+
+### Include Files
+The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples In DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU,
+FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI
+Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/)
+
+### On a Linux* System
+1. Build the program using the following `cmake` commands.
+ ```
+ $ cd PrefixSum
+ $ mkdir build
+ $ cd build
+ $ cmake ..
+ $ make
+ ```
+
+2. Run the program:
+ ```
+ make run
+ ```
+
+3. Clean the program using:
+ ```
+ make clean
+ ```
+
+### On a Windows* System
+ * Build the program using VS2017 or VS2019
+ Right click on the solution file and open using either VS2017 or VS2019 IDE.
+ Right click on the project in Solution explorer and select Rebuild.
+ From top menu select Debug -> Start without Debugging.
+
+ * Build the program using MSBuild
+ Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for
+ VS2019"
+ Run - MSBuild PrefixSum.sln /t:Rebuild /p:Configuration="Release"
+
+## Running the sample
+### Application Parameters
+
+ Usage: PrefixSum <exponent> <seed>
+
+where
+
+exponent is a positive number. The corresponding length of the sequence is 2**exponent.
+
+seed is the seed used by the random generator to generate the randomness.
+
+The sample offloads the computation to the GPU and then verifies the results on the CPU.
+The results are verified by checking that yk = yk-1 + xk holds against the original sequence. If the results match and
+the ascending order is verified, the application will display a “Success!” message.
+
+### Example of Output
+```
+$ ./PrefixSum 21 47
+
+Sequence size: 2097152, seed: 47
+Num iteration: 21
+Device: Intel(R) Gen9 HD Graphics NEO
+Kernel time: 170 ms
+
+Success!
+```
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json
new file mode 100644
index 0000000000..def268a2f8
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/sample.json
@@ -0,0 +1,29 @@
+{
+ "guid": "5D274319-02EE-44B0-B055-71E4C50D05E0",
+ "name": "PrefixSum",
+ "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ],
+ "description": "Compute Prefix Sum using Intel® oneAPI DPC++ Language",
+ "toolchain": [ "dpcpp" ],
+ "targetDevice": [ "CPU", "GPU" ],
+ "languages": [ { "cpp": {} } ],
+ "os": [ "linux", "windows" ],
+ "builder": [ "ide", "cmake" ],
+ "ciTests": {
+ "linux": [{
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make",
+ "make run"
+ ]
+ }],
+ "windows": [{
+ "steps": [
+ "MSBuild PrefixSum.sln /t:Rebuild /p:Configuration=\"Release\"",
+ "cd x64/Release",
+ "PrefixSum.exe 21 47"
+ ]
+ }]
+ }
+}
diff --git a/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp
new file mode 100644
index 0000000000..b2af8367a7
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/PrefixSum/src/PrefixSum.cpp
@@ -0,0 +1,239 @@
+//==============================================================
+// Copyright © 2020 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// PrefixSum: this code sample implements the inclusive scan (prefix sum) in parallel. That
+// is, given a randomized sequence of numbers x0, x1, x2, ..., xn, this algorithm computes and
+// returns a new sequence y0, y1, y2, ..., yn so that
+//
+// y0 = x0
+// y1 = x0 + x1
+// y2 = x0 + x1 + x2
+// .....
+// yn = x0 + x1 + x2 + ... + xn
+//
+// Below is the pseudo code for computing prefix sum in parallel:
+//
+// n is power of 2 (1, 2, 4 , 8, 16, ...):
+//
+// for i from 0 to [log2 n] - 1 do
+// for j from 0 to (n-1) do in parallel
+// if j<2^i then
+// x_{j}^{i+1} <- x_{j}^{i}}
+// else
+// x_{j}^{i+1} <- x_{j}^{i} + x_{j-2^{i}}^{i}}
+//
+// In the above, the notation x_{j}^{i} means the value of the jth element of array x in timestep i.
+// Given n processors to perform each iteration of the inner loop in constant time, the algorithm as
+// a whole runs in O(log n) time, the number of iterations of the outer loop.
+//
+
+#include <CL/sycl.hpp>
+
+// dpc_common.hpp can be found in the dev-utilities include folder.
+// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp
+#include "dpc_common.hpp"
+
+using namespace sycl;
+using namespace std;
+
+void Show(int a[], int arraysize)
+{
+ for (int i = 0; i < arraysize; ++i)
+ {
+ std::cout << a[i] << " ";
+ if ((i % 16) == 15) std::cout << "\n";
+ }
+
+ std::cout << "\n";
+ return;
+}
+
+int* ParallelPrefixSum(int* prefix1, int* prefix2, unsigned int nb, queue &q)
+{
+ unsigned int two_power = 1;
+ unsigned int num_iter = log2(nb);
+ //unsigned int uintmax = UINT_MAX;
+ int* result = NULL;
+
+ // std::cout << "uintmax " << uintmax << " " << log2(uintmax) << "\n";
+ // Buffer scope
+ {
+ buffer prefix1_buf(prefix1, range<1>{nb});
+ buffer prefix2_buf(prefix2, range<1>{nb});
+
+ // Iterate over the necessary iterations.
+ for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) {
+
+ // Submit command group for execution
+ q.submit([&](handler& h) {
+ // Create accessors
+      auto prefix1_acc = prefix1_buf.get_access<access::mode::read_write>(h);
+      auto prefix2_acc = prefix2_buf.get_access<access::mode::read_write>(h);
+
+ if (iter % 2 == 0) {
+ h.parallel_for(range<1>(nb), [=](id<1> j) {
+ if (j < two_power) {
+ prefix2_acc[j] = prefix1_acc[j];
+ }
+ else {
+ prefix2_acc[j] = prefix1_acc[j] + prefix1_acc[j - two_power];
+ }
+ }); // end parallel for loop in kernel
+ result = prefix2;
+ //std::cout << "return prefix2\n";
+ }
+ else {
+ h.parallel_for(range<1>(nb), [=](id<1> j) {
+ if (j < two_power) {
+ prefix1_acc[j] = prefix2_acc[j];
+ }
+ else {
+ prefix1_acc[j] = prefix2_acc[j] + prefix2_acc[j - two_power];
+ }
+ }); // end parallel for loop in kernel
+ result = prefix1;
+ //std::cout << "return prefix1\n";
+ }
+ }); // end device queue
+ } // end iteration
+ } // Buffer scope
+
+ // Wait for commands to complete. Enforce synchronization on the command queue
+ q.wait_and_throw();
+
+ return result;
+}
+/*
+void PrefixSum(int* x, unsigned int nb)
+{
+ unsigned int two_power = 1;
+ unsigned int num_iter = log2(nb);
+ int temp = 0;
+
+ // Iterate over the necessary iterations
+ for (unsigned int iter = 0; iter < num_iter; iter++, two_power*=2) {
+ //Show(x, nb);
+ // std::cout << "two_power: " << two_power << "\n";
+ for (unsigned int j = nb; j > 0; j--) {
+ if (j < two_power) {
+ x[j] = x[j];
+ }
+ else {
+ x[j] = x[j] + x[j - two_power];
+ }
+ }
+ }
+}
+*/
+void Usage(std::string prog_name, int exponent) {
+ std::cout << " Incorrect parameters\n";
+ std::cout << " Usage: " << prog_name << " n k \n\n";
+ std::cout << " n: Integer exponent presenting the size of the input array. The number of el\
+ement in\n";
+ std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please enter the corre\
+sponding\n";
+ std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n";
+ std::cout << " k: Seed used to generate a random sequence.\n";
+}
+
+int main(int argc, char* argv[]) {
+ unsigned int nb, seed;
+  int n, exp_max = log2(std::numeric_limits<int>::max());
+
+ // Read parameters.
+ try {
+ n = std::stoi(argv[1]);
+
+ // Verify the boundary of acceptance.
+ if (n < 0 || n >= exp_max) {
+ Usage(argv[0], exp_max);
+ return -1;
+ }
+
+ seed = std::stoi(argv[2]);
+ nb = pow(2, n);
+ } catch (...) {
+ Usage(argv[0], exp_max);
+ return -1;
+ }
+
+ std::cout << "\nSequence size: " << nb << ", seed: " << seed;
+
+ int num_iter = log2(nb);
+ std::cout << "\nNum iteration: " << num_iter << "\n";
+
+ // Define device selector as 'default'
+ default_selector device_selector;
+
+ // exception handler
+ auto exception_handler = [](exception_list exceptionList) {
+ for (std::exception_ptr const& e : exceptionList) {
+ try {
+ std::rethrow_exception(e);
+ } catch (cl::sycl::exception const& e) {
+ std::terminate();
+ }
+ }
+ };
+
+ // Create a device queue using DPC++ class queue
+ queue q(device_selector, exception_handler);
+
+  std::cout << "Device: " << q.get_device().get_info<info::device::name>() << "\n";
+
+ int *data = new int[nb];
+ int *prefix_sum1 = new int[nb];
+ int *prefix_sum2 = new int[nb];
+ int *result = NULL;
+
+ srand(seed);
+
+ // Initialize data arrays
+ for (int i = 0; i < nb; i++) {
+ data[i] = prefix_sum1[i] = rand() % 10;
+ prefix_sum2[i] = 0;
+ }
+
+ // Start timer
+ auto start = std::chrono::steady_clock::now();
+
+ result = ParallelPrefixSum(prefix_sum1, prefix_sum2, nb, q);
+
+ auto end = std::chrono::steady_clock::now();
+  auto timeKern = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+ std::cout << "Kernel time: " << timeKern << " ms" << "\n";
+
+ //std::cout << "\ndata after transforming using parallel prefix sum result:";
+ //Show(result, nb);
+
+ bool equal = true;
+
+ if (result[0] != data[0])
+ equal = false;
+ else
+ {
+ for (int i = 1; i < nb; i++) {
+ if (result[i] != result[i - 1] + data[i])
+ {
+ equal = false;
+ break;
+ }
+ }
+ }
+
+ delete[] data;
+ delete[] prefix_sum1;
+ delete[] prefix_sum2;
+
+ if (!equal) {
+ std::cout << "\nFailed: " << std::endl;
+ return -2;
+ }
+ else {
+ std::cout << "\nSuccess!" << std::endl;
+ return 0;
+ }
+}
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
new file mode 100644
index 0000000000..f472928505
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+# Set default build type to RelWithDebInfo if not specified
+if (NOT CMAKE_BUILD_TYPE)
+ message (STATUS "Default CMAKE_BUILD_TYPE not set using Release")
+ set (CMAKE_BUILD_TYPE "Release" CACHE
+ STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel"
+ FORCE)
+endif()
+
+cmake_minimum_required (VERSION 3.0)
+project(dpc_reduce LANGUAGES CXX)
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/License.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/License.txt
new file mode 100644
index 0000000000..9cde07f558
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/License.txt
@@ -0,0 +1,8 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
new file mode 100644
index 0000000000..7a08d01177
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
@@ -0,0 +1,76 @@
+# dpc_reduce Sample
+
+The dpc_reduce is a simple program that calculates pi. This program is implemented using C++ and Data Parallel C++ (DPC++) for Intel(R) CPU and accelerators.
+
+
+For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments.
+
+| Optimized for | Description
+| OS | Linux* Ubuntu* 18.04,
+| Hardware | Skylake with GEN9 or newer,
+| Software | Intel® oneAPI DPC++ Compiler (beta)
+| What you will learn | how to perform reduction with oneAPI on cpu and gpu
+| Time to complete | 30 min
+
+## Purpose
+This example demonstrates how to do reduction by using the CPU in serial mode,
+the CPU in parallel mode (using TBB), the GPU using direct DPC++ coding, the
+GPU using multiple steps with DPC++ Library algorithms transform and reduce,
+and then finally using the DPC++ Library transform_reduce algorithm.
+
+All the different modes use a simple calculation for Pi. It is a well known
+mathematical formula that if you integrate from 0 to 1 over the function,
+(4.0 / (1+x*x) )dx the answer is pi. One can approximate this integral
+by summing up the area of a large number of rectangles over this same range.
+
+Each of the different function calculates pi by breaking the range into many
+tiny rectangles and then summing up the results.
+
+The parallel computations are performed using oneTBB and oneAPI DPC++ library
+(oneDPL).
+
+## Key Implementation Details
+The basic DPC++ implementation explained in the code includes accessor,
+kernels, queues, buffers as well as some oneDPL library calls.
+
+## License
+This code sample is licensed under MIT license.
+
+## Building the dpc_reduce program for CPU and GPU
+
+### Include Files
+The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples In DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/)
+
+### On a Linux* System
+Perform the following steps:
+1. Build the program using the following 'cmake' commands
+mkdir build
+cd build
+cmake ..
+make
+
+2. Run the program using:
+make run or src/dpc_reduce
+
+3. Clean the program using:
+make clean
+
+
+## Running the Sample
+### Application Parameters
+There are no editable parameters for this sample.
+
+### Example of Output
+Number of steps is 1000000
+Cpu Seq calc: PI =3.14 in 0.00348 seconds
+Cpu TBB calc: PI =3.14 in 0.00178 seconds
+dpstd native: PI =3.14 in 0.191 seconds
+dpstd native2: PI =3.14 in 0.142 seconds
+dpstd native3: PI =3.14 in 0.002 seconds
+dpstd native4: PI =3.14 in 0.00234 seconds
+dpstd two steps: PI =3.14 in 0.00138 seconds
+dpstd transform_reduce: PI =3.14 in 0.000442 seconds
+success
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
new file mode 100644
index 0000000000..b8c2f8cb72
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
@@ -0,0 +1,29 @@
+ {
+ "guid": "ECF6C8EB-753B-4107-AF64-60662CE41726",
+ "name": "DPC Reduce",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/oneAPI DPC++ Library/CPU and GPU"],
+ "description": "It models transform reduce in different ways showing capability of oneAPI.",
+ "toolchain": ["dpcpp"],
+ "languages": [{
+ "cpp": {}
+ }],
+ "targetDevice": ["CPU", "GPU"],
+ "os": ["linux"],
+ "builder": ["cmake"],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "dpc_reduce",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make",
+ "./src/dpc_reduce"
+ ]
+ }
+ ]
+ }
+}
+
+
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt
new file mode 100644
index 0000000000..cc3703162b
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/CMakeLists.txt
@@ -0,0 +1,24 @@
+if (NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 14)
+endif()
+
+if (NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE RelWithDebInfo)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ltbb")
+
+# Add an executable target from source files
+add_executable(${PROJECT_NAME} main.cpp)
+
+if(WIN32)
+ # Specify libraries to link with
+ target_link_libraries(${PROJECT_NAME} sycl )
+
+ # Add custom target for running
+ add_custom_target(run ${PROJECT_NAME}.exe)
+else()
+ # Add custom target for running
+ add_custom_target(run ./${PROJECT_NAME})
+endif()
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
new file mode 100644
index 0000000000..25cf767a49
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
@@ -0,0 +1,519 @@
+//==============================================================
+// Copyright © 2020 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <iomanip>  // setprecision library
+#include <iostream>
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/numeric>
+#include "dpc_common.hpp"
+// Many oneAPI code samples share common include files. These
+// include files are installed locally with the product installation
+// and can be located at %ONEAPI_ROOT%\dev-utilities\latest\include
+// on your development system.
+
+using namespace sycl;
+
+// cpu_seq is a simple sequential CPU routine
+// that calculates all the slices and then
+// does a reduction.
+float calc_pi_cpu_seq(int num_steps) {
+ float step = 1.0 / (float)num_steps;
+ float x;
+ float sum = 0.0;
+ for (int i = 1; i < num_steps; i++) {
+ x = (i - 0.5) * step;
+ sum = sum + 4.0 / (1.0 + x * x);
+ }
+ return sum / (float)num_steps;
+}
+
+// cpu_tbb is a simple parallel_reduce tbb routine
+// that calculates all the slices and then
+// uses tbb reduce to combine results.
+float calc_pi_cpu_tbb(int num_steps) {
+ float step = 1.0 / (float)num_steps;
+
+ auto tbbtotal =
+      tbb::parallel_reduce(tbb::blocked_range<int>(1, num_steps), 0.0,
+                           [=](tbb::blocked_range<int> r, float running_total) {
+ float y;
+ for (int i = r.begin(); i != r.end(); i++) {
+ y = (i - 0.5) * step;
+ running_total += 4.0 / (1.0 + y * y);
+ }
+ return running_total;
+ },
+                           std::plus<float>());
+ return tbbtotal / (float)num_steps;
+}
+
+// dpstd_native uses a parallel_for to fill
+// a buffer with all the slice calculations and
+// then uses a single_task to combine all the results
+// This is not the highest performing example but shows
+// how to do calculations directly in dpc++ with
+// minimal complexity.
+template <typename Policy>
+float calc_pi_dpstd_native(size_t num_steps, Policy&& policy) {
+ float step = 1.0 / (float)num_steps;
+
+ float data[num_steps];
+
+ // Create buffer using host allocated "data" array
+ buffer buf{data, range<1>{num_steps}};
+
+ policy.queue().submit([&](handler& h) {
+    auto writeresult = buf.get_access<access::mode::write>(h);
+ h.parallel_for(range<1>{num_steps}, [=](id<1> idx) {
+ float x = ((float)idx[0] - 0.5) / (float)num_steps;
+ writeresult[idx[0]] = 4.0f / (1.0 + x * x);
+ });
+ });
+ policy.queue().wait();
+
+ // Single task is needed here to make sure
+ // data is not written over.
+ policy.queue().submit([&](handler& h) {
+    auto a = buf.get_access<access::mode::read_write>(h);
+ h.single_task([=]() {
+ for (int i = 1; i < num_steps; i++) a[0] += a[i];
+ });
+ });
+ policy.queue().wait();
+
+ float mynewresult =
+      buf.get_access<access::mode::read>()[0] / (float)num_steps;
+ return mynewresult;
+}
+
+// This option uses a parallel for to fill the array, and then use a single
+// task to reduce into groups and then use cpu for final reduction.
+template <typename Policy>
+float calc_pi_dpstd_native2(size_t num_steps, Policy&& policy, int group_size) {
+ float step = 1.0 / (float)num_steps;
+
+ float data[num_steps];
+ float myresult = 0.0;
+
+ // Create buffer using host allocated "data" array
+ buffer buf{data, range<1>{num_steps}};
+
+ // fill buffer with calculations
+ policy.queue().submit([&](handler& h) {
+    auto writeresult = buf.get_access<access::mode::write>(h);
+ h.parallel_for(range<1>{num_steps}, [=](id<1> idx) {
+ float x = ((float)idx[0] - 0.5) / (float)num_steps;
+ writeresult[idx[0]] = 4.0f / (1.0 + x * x);
+ });
+ });
+ policy.queue().wait();
+
+ size_t num_groups = num_steps / group_size;
+ float c[num_groups];
+ // create a number of groups and do a local reduction
+ // within these groups using single_task. Store each
+ // result within the output of bufc
+ for (int i = 0; i < num_groups; i++) c[i] = 0;
+ buffer bufc{c, range<1>{num_groups}};
+ for (int j = 0; j < num_groups; j++) {
+ policy.queue().submit([&](handler& h) {
+      auto my_a = buf.get_access<access::mode::read>(h);
+      auto my_c = bufc.get_access<access::mode::read_write>(h);
+ h.single_task([=]() {
+ for (int i = 0 + group_size * j; i < group_size + group_size * j; i++)
+ my_c[j] += my_a[i];
+ });
+ });
+ }
+ policy.queue().wait();
+
+  auto src = bufc.get_access<access::mode::read>();
+
+ // Sum up results on CPU
+ float mynewresult = 0.0;
+ for (int i = 0; i < num_groups; i++) mynewresult += src[i];
+
+ return mynewresult / (float)num_steps;
+}
+
+// Function operator used as transform operation in transform-reduce operations
+// implemented below.
+struct my_no_op {
+  template <typename Tp>
+  Tp&& operator()(Tp&& a) const {
+    return std::forward<Tp>(a);
+ }
+};
+
+// Structure slice area performs the calculations for
+// each rectangle that will be summed up.
+struct slice_area {
+ int num;
+ slice_area(int num_steps) { num = num_steps; }
+
+  template <typename T>
+ float operator()(T&& i) {
+ float x = ((float)i - 0.5) / (float)num;
+ return 4.0f / (1.0f + (x * x));
+ };
+};
+
+// This option uses a parallel for to fill the buffer and then
+// uses a transform_init with plus/no_op and then
+// a local reduction then global reduction.
+template <typename Policy>
+float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) {
+ float data[num_steps];
+
+ // Create buffer using host allocated "data" array
+ buffer buf{data, range<1>{num_steps}};
+
+ // fill the buffer with the calculation using parallel for
+ policy.queue().submit([&](handler& h) {
+    auto writeresult = buf.get_access<access::mode::write>(h);
+ h.parallel_for(range<1>{num_steps}, [=](id<1> idx) {
+ float x = (float)idx[0] / (float)num_steps;
+ writeresult[idx[0]] = 4.0f / (1.0f + x * x);
+ });
+ });
+ policy.queue().wait();
+
+ // Calc_begin and calc_end are iterators pointing to
+ // beginning and end of the buffer
+ auto calc_begin = oneapi::dpl::begin(buf);
+ auto calc_end = oneapi::dpl::end(buf);
+
+  using Functor = oneapi::dpl::unseq_backend::walk_n<Policy, my_no_op>;
+ float result;
+
+  // Functor will do nothing for transform_init and will use plus for reduce.
+ // In this example we have done the calculation and filled the buffer above
+ // The way transform_init works is that you need to have the value already
+ // populated in the buffer.
+  auto tf_init =
+      oneapi::dpl::unseq_backend::transform_init<Policy, std::plus<float>,
+                                                 Functor>{std::plus<float>(),
+                                                          Functor{my_no_op()}};
+
+  auto combine = std::plus<float>();
+  auto brick_reduce =
+      oneapi::dpl::unseq_backend::reduce<Policy, std::plus<float>, float>{
+          std::plus<float>()};
+  auto workgroup_size =
+      policy.queue()
+          .get_device()
+          .template get_info<info::device::max_work_group_size>();
+  auto max_comp_u = policy.queue()
+                        .get_device()
+                        .template get_info<info::device::max_compute_units>();
+ auto n_groups = (num_steps - 1) / workgroup_size + 1;
+ n_groups =
+ std::min(decltype(n_groups)(max_comp_u),
+ n_groups); // make groups max number of compute units or less
+
+ // 0. Create temporary global buffer to store temporary value
+  auto temp_buf = buffer<float>(range<1>(n_groups));
+ // 1. Reduce over each work_group
+ auto local_reduce_event =
+ policy.queue().submit([&buf, &temp_buf, &brick_reduce, &tf_init,
+ num_steps, n_groups, workgroup_size](handler& h) {
+        auto access_buf = buf.template get_access<access::mode::read>(h);
+        auto temp_acc =
+            temp_buf.template get_access<access::mode::discard_write>(h);
+        // Create temporary local buffer
+        accessor<float, 1, access::mode::discard_read_write, access::target::local>
+            temp_buf_local(range<1>(workgroup_size), h);
+ h.parallel_for(nd_range<1>(range<1>(n_groups * workgroup_size),
+ range<1>(workgroup_size)),
+ [=](nd_item<1> item_id) mutable {
+ auto global_idx = item_id.get_global_id(0);
+ // 1. Initialization (transform part).
+ tf_init(item_id, global_idx, access_buf, num_steps,
+ temp_buf_local);
+ // 2. Reduce within work group
+ float local_result = brick_reduce(
+ item_id, global_idx, num_steps, temp_buf_local);
+ if (item_id.get_local_id(0) == 0) {
+ temp_acc[item_id.get_group(0)] = local_result;
+ }
+ });
+ });
+
+ // 2. global reduction
+ auto reduce_event = local_reduce_event;
+ if (n_groups > 1) {
+ auto countby2 = decltype(n_groups)(1);
+ do {
+ reduce_event = policy.queue().submit([&reduce_event, &temp_buf, &combine,
+ countby2, n_groups](handler& h) {
+ h.depends_on(reduce_event);
+        auto temp_acc =
+            temp_buf.template get_access<access::mode::read_write>(h);
+ h.parallel_for(range<1>(n_groups), [=](item<1> item_id) mutable {
+ auto global_idx = item_id.get_linear_id();
+
+ if (global_idx % (2 * countby2) == 0 &&
+ global_idx + countby2 < n_groups) {
+ temp_acc[global_idx] =
+ combine(temp_acc[global_idx], temp_acc[global_idx + countby2]);
+ }
+ });
+ });
+ countby2 *= 2;
+ } while (countby2 < n_groups);
+ }
+
+  float answer = temp_buf.template get_access<access::mode::read>()[0];
+ result = answer / (float)num_steps;
+ return result;
+}
+
+// dpstd_native4 fills a buffer with number 1...num_steps and then
+// calls transform_init to calculate the slices and then
+// does a reduction in two steps - global and then local.
+template <typename Policy>
+float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) {
+  std::vector<float> data(num_steps);
+ float result = 0.0;
+
+ buffer buf2{data.data(), range<1>{num_steps}};
+
+ // fill buffer with 1...num_steps
+ policy.queue().submit([&](handler& h) {
+    auto writeresult = buf2.get_access<access::mode::write>(h);
+ h.parallel_for(range<1>{num_steps},
+ [=](id<1> idx) { writeresult[idx[0]] = (float)idx[0]; });
+ });
+ policy.queue().wait();
+
+ auto calc_begin = oneapi::dpl::begin(buf2);
+ auto calc_end = oneapi::dpl::end(buf2);
+
+  using Functor2 = oneapi::dpl::unseq_backend::walk_n<Policy, slice_area>;
+
+  // The buffer has 1...num_steps in it, and now we will use that as an input
+  // to the slice structure which will calculate the area of each
+  // rectangle.
+  auto tf_init =
+      oneapi::dpl::unseq_backend::transform_init<Policy, std::plus<float>,
+                                                 Functor2>{
+          std::plus<float>(), Functor2{slice_area(num_steps)}};
+
+  auto combine = std::plus<float>();
+  auto brick_reduce =
+      oneapi::dpl::unseq_backend::reduce<Policy, std::plus<float>, float>{
+          std::plus<float>()};
+
+ // get workgroup_size from the device
+  auto workgroup_size =
+      policy.queue()
+          .get_device()
+          .template get_info<info::device::max_work_group_size>();
+
+  // get number of compute units from device.
+  auto max_comp_u = policy.queue()
+                        .get_device()
+                        .template get_info<info::device::max_compute_units>();
+
+ auto n_groups = (num_steps - 1) / workgroup_size + 1;
+
+ // use the smaller of the number of workgroups device has or the
+ // number of steps/workgroups
+ n_groups = std::min(decltype(n_groups)(max_comp_u), n_groups);
+
+ // Create temporary global buffer to store temporary value
+  auto temp_buf = buffer<float>(range<1>(n_groups));
+
+ // Reduce over each work_group
+ auto local_reduce_event =
+ policy.queue().submit([&buf2, &temp_buf, &brick_reduce, &tf_init,
+ num_steps, n_groups, workgroup_size](handler& h) {
+ // grab access to the previous input
+        auto access_buf = buf2.template get_access<access::mode::read>(h);
+        auto temp_acc =
+            temp_buf.template get_access<access::mode::discard_write>(h);
+        // Create temporary local buffer
+        accessor<float, 1, access::mode::discard_read_write, access::target::local>
+            temp_buf_local(range<1>(workgroup_size), h);
+ h.parallel_for(nd_range<1>(range<1>(n_groups * workgroup_size),
+ range<1>(workgroup_size)),
+ [=](nd_item<1> item_id) mutable {
+ auto global_idx = item_id.get_global_id(0);
+ // 1. Initialization (transform part). Fill local
+ // memory
+ tf_init(item_id, global_idx, access_buf, num_steps,
+ temp_buf_local);
+ // 2. Reduce within work group
+ float local_result = brick_reduce(
+ item_id, global_idx, num_steps, temp_buf_local);
+ if (item_id.get_local_id(0) == 0) {
+ temp_acc[item_id.get_group(0)] = local_result;
+ }
+ });
+ });
+
+ // global reduction
+ auto reduce_event = local_reduce_event;
+ if (n_groups > 1) {
+ auto countby2 = decltype(n_groups)(1);
+ do {
+ reduce_event = policy.queue().submit([&reduce_event, &temp_buf, &combine,
+ countby2, n_groups](handler& h) {
+ h.depends_on(reduce_event);
+        auto temp_acc =
+            temp_buf.template get_access<access::mode::read_write>(h);
+ h.parallel_for(range<1>(n_groups), [=](item<1> item_id) mutable {
+ auto global_idx = item_id.get_linear_id();
+
+ if (global_idx % (2 * countby2) == 0 &&
+ global_idx + countby2 < n_groups) {
+ temp_acc[global_idx] =
+ combine(temp_acc[global_idx], temp_acc[global_idx + countby2]);
+ }
+ });
+ });
+ countby2 *= 2;
+ } while (countby2 < n_groups);
+ }
+  float answer = temp_buf.template get_access<access::mode::read>()[0];
+ result = answer / (float)num_steps;
+
+ return result;
+}
+
+// This function shows the use of two different DPC++ library calls.
+// The first is a transform calls which will fill a buff with the
+// calculations of each small rectangle. The second call is the reduce
+// call which sums up the results of all the elements in the buffer.
+template <typename Policy>
+float calc_pi_dpstd_two_steps_lib(int num_steps, Policy&& policy) {
+ float step = 1.0 / (float)num_steps;
+
+  buffer<float> calc_values{num_steps};
+ auto calc_begin2 = oneapi::dpl::begin(calc_values);
+ auto calc_end2 = oneapi::dpl::end(calc_values);
+
+ // use DPC++ library call transform to fill the buffer with
+ // the area calculations for each rectangle.
+  std::transform(policy, oneapi::dpl::counting_iterator<int>(1),
+                 oneapi::dpl::counting_iterator<int>(num_steps), calc_begin2,
+ [=](int i) {
+ float x = (((float)i - 0.5f) / (float)(num_steps));
+ return (4.0f / (1.0f + x * x));
+ });
+
+ policy.queue().wait();
+
+ // use the DPC++ library call to reduce the array using plus
+ float result =
+      std::reduce(policy, calc_begin2, calc_end2, 0.0f, std::plus<float>());
+ policy.queue().wait();
+
+ result = result / (float)num_steps;
+
+ return result;
+}
+
+// This function uses the DPC++ library call
+// transform reduce. It does everything in one library
+// call.
+template <typename Policy>
+float calc_pi_dpstd_onestep(int num_steps, Policy& policy) {
+ float step = 1.0f / (float)num_steps;
+
+ float total = std::transform_reduce(
+      policy, oneapi::dpl::counting_iterator<int>(1),
+      oneapi::dpl::counting_iterator<int>(num_steps), 0.0f, std::plus<float>(),
+ [=](int i) {
+ float x = (float)(((float)i - 0.5f) / (float(num_steps)));
+ return (4.0f / (1.0f + x * x));
+ });
+ total = total * (float)step;
+
+ return total;
+}
+
+int main(int argc, char** argv) {
+ int num_steps = 1000000;
+ printf("Number of steps is %d\n", num_steps);
+ int groups = 10000;
+
+ float pi;
+ queue myQueue{property::queue::in_order()};
+ auto policy = oneapi::dpl::execution::make_device_policy(
+ queue(default_selector{}, dpc_common::exception_handler));
+
+ // Since we are using JIT compiler for samples,
+ // we need to run each step once to allow for compile
+ // to occur before we time execution of function.
+ pi = calc_pi_dpstd_native(num_steps, policy);
+ pi = calc_pi_dpstd_native2(num_steps, policy, groups);
+ pi = calc_pi_dpstd_native3(num_steps, groups, policy);
+ pi = calc_pi_dpstd_native4(num_steps, groups, policy);
+
+ pi = calc_pi_dpstd_two_steps_lib(num_steps, policy);
+ pi = calc_pi_dpstd_onestep(num_steps, policy);
+
+ dpc_common::TimeInterval T;
+ pi = calc_pi_cpu_seq(num_steps);
+ auto stop = T.Elapsed();
+ std::cout << "Cpu Seq calc: \t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop << " seconds\n";
+
+ dpc_common::TimeInterval T2;
+ pi = calc_pi_cpu_tbb(num_steps);
+ auto stop2 = T2.Elapsed();
+ std::cout << "Cpu TBB calc: \t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop2 << " seconds\n";
+
+ dpc_common::TimeInterval T3;
+ pi = calc_pi_dpstd_native(num_steps, policy);
+ auto stop3 = T3.Elapsed();
+ std::cout << "dpstd native:\t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop3 << " seconds\n";
+
+ dpc_common::TimeInterval T3a;
+ pi = calc_pi_dpstd_native2(num_steps, policy, groups);
+ auto stop3a = T3a.Elapsed();
+ std::cout << "dpstd native2:\t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop3a << " seconds\n";
+
+ dpc_common::TimeInterval T3b;
+ pi = calc_pi_dpstd_native3(num_steps, groups, policy);
+ auto stop3b = T3b.Elapsed();
+ std::cout << "dpstd native3:\t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop3b << " seconds\n";
+
+ dpc_common::TimeInterval T3c;
+ pi = calc_pi_dpstd_native4(num_steps, groups, policy);
+ auto stop3c = T3c.Elapsed();
+ std::cout << "dpstd native4:\t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop3c << " seconds\n";
+
+ dpc_common::TimeInterval T4;
+ pi = calc_pi_dpstd_two_steps_lib(num_steps, policy);
+ auto stop4 = T4.Elapsed();
+ std::cout << "dpstd two steps:\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop4 << " seconds\n";
+
+ dpc_common::TimeInterval T5;
+ pi = calc_pi_dpstd_onestep(num_steps, policy);
+ auto stop5 = T5.Elapsed();
+ std::cout << "dpstd transform_reduce: ";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop5 << " seconds\n";
+
+ std::cout << "success\n";
+ return 0;
+}
diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/CMakeLists.txt
new file mode 100644
index 0000000000..069c03849e
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_CXX_COMPILER "icpx")
+# Set default build type to RelWithDebInfo if not specified
+if (NOT CMAKE_BUILD_TYPE)
+ message (STATUS "Default CMAKE_BUILD_TYPE not set using Release")
+ set (CMAKE_BUILD_TYPE "Release" CACHE
+ STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel"
+ FORCE)
+endif()
+
+cmake_minimum_required (VERSION 3.0)
+project(openmp_reduction LANGUAGES CXX)
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/License.txt b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/License.txt
new file mode 100644
index 0000000000..9cde07f558
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/License.txt
@@ -0,0 +1,8 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/README.md b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/README.md
new file mode 100644
index 0000000000..3836e7fc0e
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/README.md
@@ -0,0 +1,67 @@
+# openmp_reduction Sample
+
+The openmp_reduction sample is a simple program that calculates pi. This program is implemented using C++ and OpenMP for Intel(R) CPUs and accelerators.
+
+For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments.
+
+| Optimized for | Description
+| OS | Linux* Ubuntu* 18.04
+| Hardware | Skylake with GEN9 or newer
+| Software | Intel® oneAPI DPC++ Compiler (beta)
+| What you will learn | How to run OpenMP on the CPU as well as with GPU offload
+| Time to complete | 10 min
+
+## Purpose
+This example demonstrates how to do reduction by using the CPU in serial mode,
+the CPU in parallel mode (using OpenMP), and the GPU using OpenMP offloading.
+
+All the different modes use a simple calculation for Pi. It is a well known
+mathematical formula that if you integrate from 0 to 1 over the function,
+(4.0 / (1+x*x) )dx the answer is pi. One can approximate this integral
+by summing up the area of a large number of rectangles over this same range.
+
+Each of the different functions calculates pi by breaking the range into many
+tiny rectangles and then summing up the results.
+
+## Key Implementation Details
+This code shows how to use OpenMP on the CPU host as well as using target offload capabilities.
+
+## License
+This code sample is licensed under MIT license.
+
+## Building the openmp_reduction program for CPU and GPU
+
+### Include Files
+The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples In DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/)
+
+### On a Linux* System
+Perform the following steps:
+
+mkdir build
+cd build
+cmake ..
+
+1. Build the program using the following make commands
+make
+
+2. Run the program using:
+make run or src/openmp_reduction
+
+3. Clean the program using:
+make clean
+
+
+## Running the Sample
+
+### Application Parameters
+There are no editable parameters for this sample.
+
+### Example of Output (results vary depending on hardware)
+Number of steps is 1000000
+Cpu Seq calc: PI =3.14 in 0.00105 seconds
+Host OpenMP: PI =3.14 in 0.0010 seconds
+Offload OpenMP: PI =3.14 in 0.0005 seconds
+success
diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/sample.json b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/sample.json
new file mode 100644
index 0000000000..78b550e82c
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/sample.json
@@ -0,0 +1,29 @@
+ {
+ "guid": "ECF6C8EB-753B-4107-AF64-60662CE41726",
+ "name": "OpenMP Reduction",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/oneAPI DPC++ Library/CPU and GPU"],
+ "description": "Calculates pi with OpenMP reduction on the CPU and with GPU offload, showing capability of oneAPI.",
+ "toolchain": ["dpcpp"],
+ "languages": [{
+ "cpp": {}
+ }],
+ "targetDevice": ["CPU", "GPU"],
+ "os": ["linux"],
+ "builder": ["cmake"],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "dpc_reduce",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make ",
+ "./src/openmp_reduction"
+ ]
+ }
+ ]
+ }
+}
+
+
diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/CMakeLists.txt
new file mode 100644
index 0000000000..90721a5f66
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/CMakeLists.txt
@@ -0,0 +1,24 @@
+if (NOT CMAKE_CXX_STANDARD)
+ set(CMAKE_CXX_STANDARD 14)
+endif()
+
+if (NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE RelWithDebInfo)
+endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fiopenmp -fopenmp-targets=spir64 -fsycl")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
+
+# Add an executable target from source files
+add_executable(${PROJECT_NAME} main.cpp)
+
+if(WIN32)
+ # Specify libraries to link with
+ target_link_libraries(${PROJECT_NAME} sycl )
+
+ # Add custom target for running
+ add_custom_target(run ${PROJECT_NAME}.exe)
+else()
+ # Add custom target for running
+ add_custom_target(run ./${PROJECT_NAME})
+endif()
diff --git a/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/main.cpp
new file mode 100644
index 0000000000..b36aae7ab5
--- /dev/null
+++ b/DirectProgramming/DPC++/ParallelPatterns/openmp_reduction/src/main.cpp
@@ -0,0 +1,106 @@
+//==============================================================
+// Copyright © 2020 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <iomanip> // setprecision library
+#include <iostream>
+#include "dpc_common.hpp"
+// Many oneAPI code samples share common include files. These
+// include files are installed locally with the product installation
+// and can be located at %ONEAPI_ROOT%\dev-utilities\latest\include
+// on your development system.
+
+
+// cpu_seq_calc_pi is a simple sequential CPU routine
+// that calculates all the slices and then
+// does a reduction.
+float cpu_seq_calc_pi(int num_steps) {
+ float step = 1.0 / (float)num_steps;
+ float x;
+ float pi;
+ float sum = 0.0;
+ for (int i = 1; i < num_steps; i++) {
+ x = ((float)i - 0.5f) * step;
+ sum = sum + 4.0f / (1.0f + x * x);
+ }
+ pi = sum * step;
+ return pi;
+}
+
+// openmp_host_calc_pi is a simple parallel
+// calculation that uses OpenMP running
+// on the host. By default openmp
+// will use all the cores available
+// and execute the code in parallel and
+// then perform a reduction.
+float openmp_host_calc_pi(int num_steps) {
+ float step = (1.0f / num_steps);
+ float pi = 0.0;
+ float sum = 0.0;
+#pragma omp parallel for reduction(+ : sum)
+ for (int i = 1; i < num_steps; i++) {
+ float x = ((float)i - 0.5f) * step;
+ sum = sum + 4.0f / (1.0f + x * x);
+ }
+ pi = step * sum;
+ return pi;
+}
+
+// openmp_device_calc_pi is a simple parallel
+// calculation that uses OpenMP running
+// on the device through the use of the
+// target specifier.
+// This will execute the code in parallel.
+
+float openmp_device_calc_pi(int num_steps) {
+ float pi = 0.0;
+ float step = (1.0f / num_steps);
+ float sum = 0.0;
+#pragma omp target teams distribute parallel for reduction(+ : sum)
+ for (int i = 1; i < num_steps; i++) {
+ float x = ((float)i - 0.5f) * step;
+ sum = sum + 4.0f / (1.0 + x * x);
+ }
+ pi = sum * step;
+ return pi;
+}
+
+int main(int argc, char** argv) {
+ int num_steps = 1000000;
+ printf("Number of steps is %d\n", num_steps);
+ float pi;
+
+ // Due to the overhead associated with
+ // JIT, run the offload calculation once
+ // that allows code to be compiled. Execution
+ // time is measured the 2nd time you run it.
+ pi = openmp_device_calc_pi(num_steps);
+
+ dpc_common::TimeInterval T;
+ pi = cpu_seq_calc_pi(num_steps);
+ auto stop = T.Elapsed();
+ std::cout << "Cpu Seq calc: \t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop << " seconds"
+ << "\n";
+
+ dpc_common::TimeInterval T2;
+ pi = openmp_host_calc_pi(num_steps);
+ auto stop2 = T2.Elapsed();
+ std::cout << "Host OpenMP:\t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop2 << " seconds"
+ << "\n";
+
+ dpc_common::TimeInterval T3;
+ pi = openmp_device_calc_pi(num_steps);
+ auto stop3 = T3.Elapsed();
+ std::cout << "Offload OpenMP:\t\t";
+ std::cout << std::setprecision(3) << "PI =" << pi;
+ std::cout << " in " << stop3 << " seconds"
+ << "\n";
+
+ std::cout << "success\n";
+ return 0;
+}
diff --git a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md
index fd706c0b84..482899704b 100644
--- a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md
+++ b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/README.md
@@ -8,7 +8,7 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa
|:--- |:---
| OS | Linux* Ubuntu* 18.04; Windows 10
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler beta;
+| Software | Intel® oneAPI DPC++/C++ Compiler;
| What you will learn | How to parallel process image data using DPC++ for producing a Discrete Cosine Transform
| Time to complete | 15 minutes
diff --git a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json
index a6ff50dad1..0f1a243409 100644
--- a/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json
+++ b/DirectProgramming/DPC++/SpectralMethods/DiscreteCosineTransform/sample.json
@@ -1,7 +1,7 @@
{
"name": "Discrete Cosine Transform",
"description": "An image processing algorithm as seen in the JPEG compression standard.",
- "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU"],
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/Intel® oneAPI DPC++/C++ Compiler/CPU and GPU"],
"os": ["linux", "windows"],
"builder": ["ide", "cmake"],
"languages": [{"cpp":{}}],
diff --git a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md
index 6459b25e05..346752f830 100644
--- a/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md
+++ b/DirectProgramming/DPC++/StructuredGrids/1d_HeatTransfer/README.md
@@ -12,8 +12,8 @@ and search based on relevant terms noted in the comments.
|:--- |:---
| OS | Linux Ubuntu 18.04
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta)
-| What you will learn | How to simulate 1D Heat Transfer using Intel DPC++ compiler
+| Software | Intel® oneAPI DPC++/C++ Compiler
+| What you will learn | How to simulate 1D Heat Transfer using Intel® oneAPI DPC++/C++ Compiler
| Time to complete | 10 minutes
diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md
index 604dd14b56..03b33a9171 100644
--- a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md
+++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md
@@ -17,8 +17,8 @@ and search based on relevant terms noted in the comments.
|:--- |:---
| OS | Linux Ubuntu 18.04
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta)
-| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler
+| Software | Intel® oneAPI DPC++/C++ Compiler
+| What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++/C++ Compiler
| Time to complete | 10 minutes
@@ -53,9 +53,12 @@ global ID variable) for a single time step.
This code sample is licensed under MIT license.
-
## Building the `iso2dfd` Program for CPU and GPU
+### Include Files
+
+The include folder is located at %ONEAPI_ROOT%\dev-utilities\latest\include on your development system.
+
### Running Samples In DevCloud
If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU,
@@ -92,18 +95,6 @@ Perform the following steps:
Right click on the project in Solution explorer and select Rebuild.
From top menu select Debug -> Start without Debugging.
->If you see the following error message when compiling this sample:
->
-```
-Error 'dpc_common.hpp' file not found
-```
->You need to add the following directory to the list of include folders, that are required by your project, in your project's Visual Studio project property panel. The missing include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
-
-* Build the program using MSBuild
- Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019"
- Run - MSBuild iso2dfd.sln /t:Rebuild /p:Configuration="Release"
-
-
## Running the Sample
### Application Parameters
diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp
index 710d87051b..62bd936ccf 100644
--- a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp
+++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp
@@ -31,7 +31,6 @@
#include
#include
#include
-#include
#include
#include
#include
diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md
index 516f9c1ba6..67005704b9 100644
--- a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md
+++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md
@@ -8,11 +8,11 @@ For comprehensive instructions regarding DPC++ Programming, go to https://softwa
|:--- |:---
| OS | Linux* Ubuntu* 18.04; Windows 10
| Hardware | Skylake with GEN9 or newer
-| Software | Intel® oneAPI DPC++ Compiler beta;
-| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler
+| Software | Intel® oneAPI DPC++/C++ Compiler;
+| What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++/C++ Compiler
| Time to complete | 15 minutes
-Performance number tabulation [if applicable -- **NO for beta**]
+Performance number tabulation
| iso3dfd sample | Performance data
|:--- |:---
diff --git a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md
index 50c61fa567..e5a208706b 100644
--- a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md
+++ b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/README.md
@@ -14,11 +14,11 @@ and search based on relevant terms noted in the comments.
|:--- |:---
| OS | Linux Ubuntu 18.04; Windows 10 or Windows Server 2017
| Hardware | Kaby Lake with GEN9 or newer
-| Software | Intel Data Parallel C++ Compiler (beta)
-| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler
+| Software | Intel® oneAPI DPC++/C++ Compiler
+| What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++/C++ Compiler
| Time to complete | 15 minutes
-Performance number tabulation [if applicable]
+Performance number tabulation
| motionsim sample | Performance data
|:--- |:---
@@ -104,18 +104,6 @@ Perform the following steps:
Right click on the project in Solution explorer and select Rebuild
From top menu select Debug -> Start without Debugging
->If you see the following error message when compiling this sample:
->
-```
-Error 'dpc_common.hpp' file not found
-```
->You need to add the following directory to the list of include folders, that are required by your project, in your project's Visual Studio project property panel. The missing include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
-
- * Build the program using MSBuild
- Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019"
- Run - MSBuild Particle_Diffusion.sln /t:Rebuild /p:Configuration="Release"
-
-
## Running the Sample
### Application Parameters
diff --git a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp
index fda492d9e0..efbdb7c728 100644
--- a/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp
+++ b/DirectProgramming/DPC++/StructuredGrids/particle-diffusion/src/motionsim.cpp
@@ -25,7 +25,6 @@
//
#include
-#include
#include
#include
#include
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt
new file mode 100755
index 0000000000..6ae6386d49
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(CRR)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md
new file mode 100755
index 0000000000..ab98bae8d7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md
@@ -0,0 +1,224 @@
+# CRR Binomial Tree Model for Option Pricing
+An FPGA-optimized reference design computing the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options.
+
+The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. Additional reference material specific to option pricing algorithms is provided in the References section of this README.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta); Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | Review a high performance DPC++ design optimized for FPGA
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+
+**Performance**
+Please refer to performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 118 assets/s
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 243 assets/s
+
+
+## Purpose
+This sample implements the Cox-Ross-Rubinstein (CRR) binomial tree model that is used in the finance field for American exercise options with five Greeks (delta, gamma, theta, vega and rho). The simple idea is to model all possible asset price paths using a binomial tree.
+
+## Key Implementation Details
+
+### Design Inputs
+This design reads inputs from the `ordered_inputs.csv` file. The inputs are:
+
+| Input | Description
+--- |---
+| `n_steps` | Number of time steps in the binomial tree. The maximum `n_steps` in this design is 8189.
+| `cp` | -1 or 1 represents put and call options, respectively.
+| `spot` | Spot price of the underlying asset.
+| `fwd` | Forward price of the underlying asset.
+| `strike` | Exercise price of the option.
+| `vol` | Percent volatility that the design reads as a decimal value.
+| `df` | Discount factor to option expiry.
+| `t` | Time, in years, to the maturity of the option.
+
+### Design Outputs
+This design writes outputs to the `ordered_outputs.csv` file. The outputs are:
+
+| Output | Description
+--- |---
+| `value` | Option price
+| `delta` | Measures the rate of change of the theoretical option value with respect to changes in the underlying asset's price.
+| `gamma` | Measures the rate of change in the `delta` with respect to changes in the underlying price.
+| `vega` | Measures sensitivity to volatility.
+| `theta` | Measures the sensitivity of the value of the derivative to the passage of time.
+| `rho` | Measures sensitivity to the interest rate.
+
+### Design Correctness
+This design tests the correctness of the optimized FPGA code by comparing its output to a golden result computed on the CPU.
+
+### Design Performance
+This design measures the FPGA performance to determine how many assets can be processed per second.
+
+## License
+This code sample is licensed under MIT license.
+
+## Building the CRR Program
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (FPGA) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 48h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here .
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./crr.fpga_emu [-o=] (Linux)
+
+ crr.fpga_emu.exe [-o=] (Windows)
+ ```
+ 2. Run the sample on the FPGA device:
+ ```
+ ./crr.fpga [-o=] (Linux)
+ ```
+
+### Application Parameters
+
+| Argument | Description
+--- |---
+| `` | Optional argument that provides the input data. The default file is `/data/ordered_inputs.csv`
+| `-o=` | Optional argument that specifies the name of the output file. The default name of the output file is `ordered_outputs.csv`.
+
+### Example of Output
+```
+============ Correctness Test =============
+Running analytical correctness checks...
+CPU-FPGA Equivalence: PASS
+
+============ Throughput Test =============
+Avg throughput: 66.2 assets/s
+```
+
+## Additional Design Information
+
+### Source Code Explanation
+
+| File | Description
+--- |---
+| `main.cpp` | Contains both host code and SYCL* kernel code.
+| `CRR_common.hpp` | Header file for `main.cpp`. Contains the data structures needed for both host code and SYCL* kernel code.
+
+
+
+### Backend Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsdaz` | Denormals are zero
+`-Xsrounding=faithful` | Rounds results to either the upper or lower nearest single-precision numbers
+`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus
+`-Xsseed=2` | Uses seed 2 during Quartus, yields slightly higher fMAX
+
+### Preprocessor Define Flags
+
+| Flag | Description
+--- |---
+`-DOUTER_UNROLL=1` | Uses the value 1 for the constant OUTER_UNROLL, controls the number of CRRs that can be processed in parallel
+`-DINNER_UNROLL=64` | Uses the value 64 for the constant INNER_UNROLL, controls the degree of parallelization within the calculation of 1 CRR
+`-DOUTER_UNROLL_POW2=1` | Uses the value 1 for the constant OUTER_UNROLL_POW2, controls the number of memory banks
+
+
+NOTE: The Xsseed, DOUTER_UNROLL, DINNER_UNROLL and DOUTER_UNROLL_POW2 values differ depending on the board being targeted. More information about the unroll factors can be found in `/src/CRR_common.hpp`.
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 20, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com).
+
+The performance was measured by Intel on July 20, 2020
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+### References
+
+[Khronos SYCL Resources](https://www.khronos.org/sycl/resources)
+
+[Binomial options pricing model](https://en.wikipedia.org/wiki/Binomial_options_pricing_model)
+
+[Wikipedia page for finance Greeks](https://en.wikipedia.org/wiki/Greeks_(finance))
+
+[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln
new file mode 100755
index 0000000000..a95fce9c30
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crr", "crr.vcxproj", "{8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.ActiveCfg = Debug|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.Build.0 = Debug|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.ActiveCfg = Release|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {6887ACDD-3E54-4396-A921-99C630333932}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj
new file mode 100755
index 0000000000..62a523e96c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj
@@ -0,0 +1,165 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {8eb512ff-4487-4fec-9b88-8c0da734b1b2}
+ Win32Proj
+ crr
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions)
+ false
+ $(IntDir)crr.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions)
+ $(IntDir)crr.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user
new file mode 100755
index 0000000000..9115b3f275
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user
@@ -0,0 +1,14 @@
+
+
+
+ false
+
+
+ ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv
+ WindowsLocalDebugger
+
+
+ ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json
new file mode 100755
index 0000000000..6155ce223d
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "D725E06E-0ECE-44F8-910D-AD1A8C89ED89",
+ "name": "CRR Binomial Tree Model for Option Pricing",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "FPGA-optimized reference design of the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./crr.fpga_emu ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "crr.fpga_emu.exe ./data/ordered_inputs.csv -o=./data/ordered_outputs.csv"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt
new file mode 100755
index 0000000000..8c56a699ad
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt
@@ -0,0 +1,116 @@
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME crr)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+set(OUTER_UNROLL_A10 1)
+set(INNER_UNROLL_A10 64)
+set(OUTER_UNROLL_POW2_A10 1)
+set(OUTER_UNROLL_S10 2)
+set(INNER_UNROLL_S10 64)
+set(OUTER_UNROLL_POW2_S10 2)
+set(SEED_A10 1)
+set(SEED_S10 2)
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(OUTER_UNROLL ${OUTER_UNROLL_A10})
+SET(INNER_UNROLL ${INNER_UNROLL_A10})
+SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_A10})
+SET(SEED ${SEED_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(OUTER_UNROLL ${OUTER_UNROLL_S10})
+ SET(INNER_UNROLL ${INNER_UNROLL_S10})
+ SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_S10})
+ SET(SEED ${SEED_S10})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS -fintelfpga -c -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsdaz -Xsrounding=faithful -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+set(FINAL_LINK_FLAGS -fintelfpga -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2}")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+
+#copy input data
+configure_file("data/ordered_inputs.csv" "data/ordered_inputs.csv" COPYONLY)
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpgas
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "crr_fpga.o")
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${SOURCE_FILE})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} ${DEVICE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_FPGA_OBJ})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CRR_common.hpp CRR_common.hpp COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${SOURCE_FILE} CRR_common.hpp)
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu data/ordered_inputs.csv -o=data/ordered_output.csv
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp
new file mode 100755
index 0000000000..6f2537e1e0
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp
@@ -0,0 +1,149 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __CRR_COMMON_H__
+#define __CRR_COMMON_H__
+
+constexpr int kMaxStringLen = 1024;
+
+// Increments of kMaxNSteps
+constexpr size_t kMaxNSteps = 8189;
+constexpr size_t kMaxNSteps1 = 8190;
+constexpr size_t kMaxNSteps2 = 8191;
+constexpr size_t kMaxNSteps3 = 8192;
+
+// Increment by a small epsilon in order to compute derivative
+// of option price with respect to Vol or Interest. The derivatives
+// are then used to compute Vega and Rho.
+constexpr double kEpsilon = 0.0001;
+
+// Whenever calculations are made for Option Price 0, need to increment
+// nsteps by 2 to ensure all the required derivative prices are calculated.
+constexpr size_t kOpt0 = 2;
+
+
+// Solver configuration settings that are dependent on selected
+// board. Most notable settings are:
+
+// OUTER_UNROLL controls the number of CRRs that can be processed
+// in parallel in a SIMD fashion (number of CRRs must be >= OUTER_UNROLL).
+// This is ideally a power of two, but does not have to be. Since
+// the DRAM bandwidth requirement is low, increasing OUTER_UNROLL
+// should result in fairly linear speedup. (max: 32 on PAC A10)
+
+// INNER_UNROLL controls the degree of parallelization within
+// the calculation of a single CRR. This must be a power of two. Increasing
+// INNER_UNROLL has a lower area overhead than increasing OUTER_UNROLL;
+// however, there are diminishing returns as INNER_UNROLL is increased with
+// respect to the number of time steps. (max: 128 on PAC A10)
+
+
+// Data structure for original input data.
+typedef struct {
+ int cp; /* cp = -1 or 1 for Put & Call respectively. */
+ double n_steps; /* n_steps = number of time steps in the binomial tree. */
+ double strike; /* strike = exercise price of option. */
+ double spot; /* spot = spot price of the underlying. */
+ double fwd; /* fwd = forward price of the underlying. */
+ double vol; /* vol = per cent volatility, input as a decimal. */
+ double df; /* df = discount factor to option expiry. */
+ double t; /* t = time in years to the maturity of the option. */
+
+} InputData;
+
+// Data structure as the inputs to FPGA.
+// Element[i] is used to compute option_price[i].
+typedef struct {
+ double n_steps; /* n_steps = number of time steps in the binomial tree. */
+ double u[3]; /* u = the increase factor of an up movement in the binomial tree,
+ same for each time step. */
+ double u2[3]; /* u2 = the square of increase factor. */
+ double c1[3]; /* c1 = the probability of a down movement in the binomial tree,
+ same for each time step. */
+ double c2[3]; /* c2 = the probability of an up movement in the binomial tree. */
+ double umin[3]; /* umin = minimum price of the underlying at the maturity. */
+ double param_1[3];/* param_1[i] = cp * umin[i] */
+ double param_2; /* param_2 = cp * strike */
+
+} CRRInParams;
+
+// Data structure as the output from ProcessKernelResult().
+typedef struct {
+ double pgreek[4]; /* Stores the 4 derivative prices in the binomial tree
+ required to compute the Premium and Greeks. */
+ double vals[3]; /* Three option prices calculated */
+
+} InterRes;
+
+// Data structure for option price and five Greeks.
+typedef struct {
+ double value; /* value = option price. */
+ double delta;
+ double gamma;
+ double vega;
+ double theta;
+ double rho;
+} OutputRes;
+
+// Data structures required by the kernel
+typedef struct {
+ double u;
+ double c1;
+ double c2;
+ double param_1;
+ double param_2;
+ short n_steps;
+ short pad1;
+ int pad2;
+ double pad3;
+ double pad4;
+} CRRMeta;
+
+typedef struct {
+ double u2;
+ double p1powu;
+ double init_optval;
+ double pad;
+} ArrayEle;
+
+typedef struct {
+ ArrayEle array_eles[kMaxNSteps3][3]; /* Second dimension size set to 3 to have a
+ separate ArrayEle for each option price */
+} CRRArrayEles;
+
+typedef struct {
+ ArrayEle array_eles[kMaxNSteps3];
+} CRRPerStepMeta;
+
+typedef struct {
+ double pgreek[4];
+ double optval0;
+ double pad[3];
+} CRRResParams;
+
+#endif
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja
new file mode 100755
index 0000000000..58af917f67
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja
@@ -0,0 +1,35 @@
+source_file = main.cpp
+target_name = crr
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsfpc -Xsparallel=2 -Xsseed=5
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+a10_flags = -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1
+s10_flags = -DOUTER_UNROLL=2 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=2
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${a10_flags} $in -o $out
+
+rule build_fpga_emu_s10
+ command = dpcpp /GX ${emulator_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${a10_flags} -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv
new file mode 100755
index 0000000000..3a28083fa2
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv
@@ -0,0 +1,10 @@
+8189,-1,37.5,37.50112053,85,0.4,0.99997012,0.011952191
+8189,1,37.5,37.50112053,85,0.4,0.99997012,0.011952191
+8189,-1,270,270.0080678,65,0.18,0.999940241,0.011952191
+8189,1,270,270.0080678,65,0.18,0.999940241,0.011952191
+8189,-1,292.5,292.5087402,70,0.35,0.999940241,0.011952191
+8189,1,292.5,292.5087402,70,0.35,0.999940241,0.011952191
+8189,-1,122.5,122.5109816,40,0.2,0.999910363,0.011952191
+8189,1,122.5,122.5109816,40,0.2,0.999910363,0.011952191
+8189,-1,22.5,22.50067232,55,0.3,0.999910363,0.011952191
+8189,1,22.5,22.50067232,55,0.3,0.999910363,0.011952191
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp
new file mode 100755
index 0000000000..7c92610e19
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp
@@ -0,0 +1,849 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// CRRSolver CPU/FPGA Accelerator Demo Program
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// This design implements a simple Cox-Ross-Rubinstein (CRR) binomial tree model
+// with Greeks for American exercise options.
+//
+//
+// Optimization summary:
+// -- Area-consuming but infrequent calculation is done on CPU.
+// -- Parallelize the calculation of a single CRR.
+// -- Run multiple independent CRRs in parallel.
+// -- Optimized memory configurations to reduce the need for replication
+// and to eliminate the need for double-pumping M20Ks.
+//
+// The following diagram shows the mechanism of optimizations to CRR.
+//
+//
+// +------+ ^
+// +------------>|optval| |
+// | | [2] | |
+// | +------+ |
+// | |
+// | |
+// +--+---+ |
+// +------------>|optval| |
+// | | [1] | |
+// | +--+---+ |
+// | | |
+// | | |
+// | | | Loop4(L4)
+// | | | updates
+// +---+--+ +------------>+------+ | multiple
+// |optval| |optval| | elements
+// | [0] | | [1] | | in optval[]
+// +---+--+ +------------>+------+ | simultaneously
+// | | |
+// | | |
+// | | |
+// | | |
+// | +--+---+ |
+// | |optval| |
+// +------------>| [0] | |
+// +--+---+ |
+// | |
+// | |
+// | +------+ |
+// | |optval| |
+// +------------>| [0] | |
+// +------+ +
+//
+//
+//
+//
+// step 1 step 2
+//
+//
+// <------------------------------------------+
+// Loop3(L3) updates each level of the tree
+//
+//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "CRR_common.hpp"
+#include "dpc_common.hpp"
+
+using namespace std;
+using namespace sycl;
+
+class CRRSolver;
+double CrrSolver(const int n_items, vector &in_params,
+ vector &res_params,
+ vector &in_params2, queue &q) {
+ dpc_common::TimeInterval timer;
+
+ constexpr int steps = kMaxNSteps2;
+
+ const int n_crr =
+ (((n_items + (OUTER_UNROLL - 1)) / OUTER_UNROLL) * OUTER_UNROLL) * 3;
+
+ {
+ buffer i_params(in_params.data(), in_params.size());
+ buffer r_params(res_params.data(), res_params.size());
+ buffer a_params(in_params2.data(), in_params2.size());
+
+ event e;
+ {
+ e = q.submit([&](handler &h) {
+ auto accessor_v =
+ i_params.template get_access(h);
+
+ auto accessor_v2 =
+ a_params.template get_access(h);
+
+ auto accessor_r =
+ r_params.template get_access(h);
+
+ h.single_task([=]() [[intel::kernel_args_restrict]] {
+ // Kernel requires n_crr to be a multiple of OUTER_UNROLL.
+ // This is taken care of by the host.
+ const int n_crr_div = n_crr / OUTER_UNROLL;
+
+ // Outerloop counter. Use while-loop for better timing-closure
+ // characteristics because it tells the compiler the loop body will
+ // never be skipped.
+ int oc = 0;
+ do {
+ // Metadata of CRR problems
+ [[intelfpga::register]] double u[OUTER_UNROLL];
+ [[intelfpga::register]] double c1[OUTER_UNROLL];
+ [[intelfpga::register]] double c2[OUTER_UNROLL];
+ [[intelfpga::register]] double param_1[OUTER_UNROLL];
+ [[intelfpga::register]] double param_2[OUTER_UNROLL];
+ [[intelfpga::register]] short n_steps[OUTER_UNROLL];
+
+ // Current values in binomial tree. We only need to keep track of
+ // one level worth of data, not the entire tree.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double optval[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // Initial values in binomial tree, which correspond to the last
+ // level of the binomial tree.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double init_optval[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // u2_array precalculates the power function of u2.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double u2_array[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // p1powu_array precalculates p1 multiplied by the power of u.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double p1powu_array[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // n0_optval stores the binomial tree value corresponding to node 0
+ // of a level. This is the same as what's stored in
+ // optval/init_optval, but replicating this data allows us to have
+ // only one read port for optval and init_optval, thereby removing
+ // the need of double-pumping or replication. n0_optval_2 is a copy
+ // of n0_optval that stores the node 0 value for a specific layer of
+ // the tree. pgreek is the array saving values for post-calculating
+ // Greeks.
+ [[intelfpga::register]] double n0_optval[OUTER_UNROLL];
+ [[intelfpga::register]] double n0_optval_2[OUTER_UNROLL];
+ [[intelfpga::register]] double pgreek[4][OUTER_UNROLL];
+
+ // L1 + L2:
+ // Populate init_optval -- calculate the last level of the binomial
+ // tree.
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ // Transfer data from DRAM to local memory or registers
+ const int c = oc * OUTER_UNROLL + ic;
+ const CRRMeta param = accessor_v[c];
+
+ u[ic] = param.u;
+ c1[ic] = param.c1;
+ c2[ic] = param.c2;
+ param_1[ic] = param.param_1;
+ param_2[ic] = param.param_2;
+ n_steps[ic] = param.n_steps;
+
+ for (short t = steps; t >= 0; --t) {
+ const ArrayEle param_array = accessor_v2[c].array_eles[t];
+
+ const double init_val = param_array.init_optval;
+
+ init_optval[t][ic] = init_val;
+
+ // n0_optval intends to store the node value at t == 0.
+ // Instead of qualifying this statement by an "if (t == 0)",
+ // which couples the loop counter to the timing path of the
+ // assignment, we reverse the loop direction so the last value
+ // stored corresponds to t == 0.
+ n0_optval[ic] = init_val;
+
+ // Transfer data from DRAM to local memory or registers
+ u2_array[t][ic] = param_array.u2;
+ p1powu_array[t][ic] = param_array.p1powu;
+ }
+ }
+
+ // L3:
+ // Update optval[] -- calculate each level of the binomial tree.
+ // reg[] helps to achieve updating INNER_UNROLL elements in optval[]
+ // simultaneously.
+ [[intelfpga::disable_loop_pipelining]] for (short t = 0;
+ t <= steps - 1; ++t) {
+ [[intelfpga::register]] double reg[INNER_UNROLL + 1][OUTER_UNROLL];
+
+ double val_1, val_2;
+
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ reg[0][ic] = n0_optval[ic];
+ }
+
+ // L4:
+ // Calculate all the elements in optval[] -- all the tree nodes
+ // for one level of the tree
+ [[intelfpga::ivdep]] for (int n = 0; n <= steps - 1 - t;
+ n += INNER_UNROLL) {
+
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+
+ #pragma unroll
+ for (short ri = 1; ri <= INNER_UNROLL; ++ri) {
+ reg[ri][ic] =
+ (t == 0) ? init_optval[n + ri][ic] : optval[n + ri][ic];
+ }
+
+ #pragma unroll
+ for (short ri = 0; ri < INNER_UNROLL; ++ri) {
+ const double val = sycl::fmax(
+ c1[ic] * reg[ri][ic] + c2[ic] * reg[ri + 1][ic],
+ p1powu_array[t][ic] * u2_array[n + ri][ic] -
+ param_2[ic]);
+
+ optval[n + ri][ic] = val;
+ if (n + ri == 0) {
+ n0_optval[ic] = val;
+ }
+ if (n + ri == 1) {
+ val_1 = val;
+ }
+ if (n + ri == 2) {
+ val_2 = val;
+ }
+ }
+
+ reg[0][ic] = reg[INNER_UNROLL][ic];
+
+ if (t == steps - 5) {
+ pgreek[3][ic] = val_2;
+ }
+ if (t == steps - 3) {
+ pgreek[0][ic] = n0_optval[ic];
+ pgreek[1][ic] = val_1;
+ pgreek[2][ic] = val_2;
+ n0_optval_2[ic] = n0_optval[ic];
+ }
+ }
+ }
+ }
+
+ // L5: transfer crr_res_paramss to DRAM
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ const int c = oc * OUTER_UNROLL + ic;
+ if (n_steps[ic] < steps) {
+ accessor_r[c].optval0 = n0_optval_2[ic];
+ } else {
+ accessor_r[c].optval0 = n0_optval[ic];
+ }
+ accessor_r[c].pgreek[0] = pgreek[0][ic];
+ accessor_r[c].pgreek[1] = pgreek[1][ic];
+ accessor_r[c].pgreek[2] = pgreek[2][ic];
+ accessor_r[c].pgreek[3] = pgreek[3][ic];
+ }
+ // Increment counters
+ oc += 1;
+ } while (oc < n_crr_div);
+ });
+ });
+ }
+ }
+
+ double diff = timer.Elapsed();
+ return diff;
+}
+
+void ReadInputFromFile(ifstream &input_file, vector &inp) {
+ string line_of_args;
+ while (getline(input_file, line_of_args)) {
+ InputData temp;
+ istringstream line_of_args_ss(line_of_args);
+ line_of_args_ss >> temp.n_steps;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.cp;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.spot;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.fwd;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.strike;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.vol;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.df;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.t;
+
+ inp.push_back(temp);
+ }
+}
+
+static string ToStringWithPrecision(const double value, const int p = 6) {
+ ostringstream out;
+ out.precision(p);
+ out << std::fixed << value;
+ return out.str();
+}
+
+void WriteOutputToFile(ofstream &output_file, const vector &outp) {
+ size_t n = outp.size();
+ for (size_t i = 0; i < n; ++i) {
+ OutputRes temp;
+ temp = outp[i];
+ string line = ToStringWithPrecision(temp.value, 12) + " " +
+ ToStringWithPrecision(temp.delta, 12) + " " +
+ ToStringWithPrecision(temp.gamma, 12) + " " +
+ ToStringWithPrecision(temp.vega, 12) + " " +
+ ToStringWithPrecision(temp.theta, 12) + " " +
+ ToStringWithPrecision(temp.rho, 12) + "\n";
+
+ output_file << line;
+ }
+}
+
+bool FindGetArgString(const string &arg, const char *str, char *str_value,
+ size_t maxchars) {
+ size_t found = arg.find(str, 0, strlen(str));
+ if (found != string::npos) {
+ const char *sptr = &arg.c_str()[strlen(str)];
+ for (int i = 0; i < maxchars - 1; i++) {
+ char ch = sptr[i];
+ switch (ch) {
+ case ' ':
+ case '\t':
+ case '\0':
+ str_value[i] = 0;
+ return true;
+ break;
+ default:
+ str_value[i] = ch;
+ break;
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Perform data pre-processing work
+// Three different option prices are required to solve each CRR problem
+// The following lists why each option price is required:
+// [0] : Used to compute Premium, Delta, Gamma and Theta
+// [1] : Used to compute Rho
+// [2] : Used to compute Vega
+CRRInParams PrepareData(const InputData &inp) {
+ CRRInParams in_params;
+ in_params.n_steps = inp.n_steps;
+
+ double r[2];
+ r[0] = pow(inp.df, 1.0 / inp.n_steps);
+ double d_df = exp(-inp.t * kEpsilon);
+ r[1] = pow(inp.df * d_df, 1.0 / inp.n_steps);
+ in_params.u[0] = exp(inp.vol * sqrt(inp.t / inp.n_steps));
+ in_params.u[1] = in_params.u[0];
+ in_params.u[2] = exp((inp.vol + kEpsilon) * sqrt(inp.t / inp.n_steps));
+
+ in_params.u2[0] = in_params.u[0] * in_params.u[0];
+ in_params.u2[1] = in_params.u[1] * in_params.u[1];
+ in_params.u2[2] = in_params.u[2] * in_params.u[2];
+ in_params.umin[0] = inp.spot * pow(1 / in_params.u[0], inp.n_steps + kOpt0);
+ in_params.umin[1] = inp.spot * pow(1 / in_params.u[1], inp.n_steps);
+ in_params.umin[2] = inp.spot * pow(1 / in_params.u[2], inp.n_steps);
+ in_params.c1[0] =
+ r[0] * (in_params.u[0] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[0] - 1 / in_params.u[0]);
+ in_params.c1[1] =
+ r[1] *(in_params.u[1] - pow((inp.fwd / d_df) / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[1] - 1 / in_params.u[1]);
+ in_params.c1[2] =
+ r[0] * (in_params.u[2] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[2] - 1 / in_params.u[2]);
+ in_params.c2[0] = r[0] - in_params.c1[0];
+ in_params.c2[1] = r[1] - in_params.c1[1];
+ in_params.c2[2] = r[0] - in_params.c1[2];
+
+ in_params.param_1[0] = inp.cp * in_params.umin[0];
+ in_params.param_1[1] = inp.cp * in_params.umin[1];
+ in_params.param_1[2] = inp.cp * in_params.umin[2];
+ in_params.param_2 = inp.cp * inp.strike;
+
+ return in_params;
+}
+
+CRRArrayEles PrepareArrData(const CRRInParams &in) {
+ CRRArrayEles arr;
+
+ // Write in reverse t-direction to match kernel access pattern
+ for (int i = 0; i <= in.n_steps + kOpt0; ++i) {
+ for (int inner_func_index = 0; inner_func_index < 3; ++inner_func_index) {
+ arr.array_eles[i][inner_func_index].u2 = pow(in.u2[inner_func_index], i);
+ arr.array_eles[i][inner_func_index].p1powu =
+ in.param_1[inner_func_index] * pow(in.u[inner_func_index], i + 1);
+ arr.array_eles[i][inner_func_index].init_optval =
+ fmax(in.param_1[inner_func_index] * pow(in.u2[inner_func_index], i) -
+ in.param_2, 0.0);
+ }
+ }
+
+ return arr;
+}
+
+// Metadata, used in the Kernel, is generated from the input data
+// Each CRR problem is split into 3 subproblems to calculate
+// each required option price separately
+void PrepareKernelData(vector &in_params,
+ vector &array_params,
+ vector &in_buff_params,
+ vector &in_buff2_params,
+ const int n_crrs) {
+
+ constexpr short offset = 0;
+
+ for (int wi_idx = offset, dst = offset * 3; wi_idx < n_crrs; ++wi_idx) {
+ CRRInParams &src_crr_params = in_params[wi_idx];
+
+ CRRArrayEles &src_crr_eles = array_params[wi_idx];
+
+ for (int inner_func_index = 0; inner_func_index < 3;
+ ++inner_func_index, ++dst) {
+ CRRMeta &dst_crr_meta = in_buff_params[dst];
+ CRRPerStepMeta &dst_crr_per_step_meta = in_buff2_params[dst];
+
+ dst_crr_meta.u = src_crr_params.u[inner_func_index];
+ dst_crr_meta.c1 = src_crr_params.c1[inner_func_index];
+ dst_crr_meta.c2 = src_crr_params.c2[inner_func_index];
+
+ dst_crr_meta.param_1 = src_crr_params.param_1[inner_func_index];
+ dst_crr_meta.param_2 = src_crr_params.param_2;
+
+ if (inner_func_index == 0) {
+ dst_crr_meta.n_steps = src_crr_params.n_steps + kOpt0;
+ } else {
+ dst_crr_meta.n_steps = src_crr_params.n_steps;
+ }
+ for (int i = 0; i <= kMaxNSteps2; ++i) {
+ dst_crr_per_step_meta.array_eles[i].u2 =
+ src_crr_eles.array_eles[i][inner_func_index].u2;
+ dst_crr_per_step_meta.array_eles[i].p1powu =
+ src_crr_eles.array_eles[i][inner_func_index].p1powu;
+ dst_crr_per_step_meta.array_eles[i].init_optval =
+ src_crr_eles.array_eles[i][inner_func_index].init_optval;
+ }
+ }
+ }
+}
+
+// Takes in the result from the kernel and stores the 3 option prices
+// belonging to the same CRR problem in one InterRes element
+void ProcessKernelResult(const vector &res_params,
+ vector &postp_buff, const int n_crrs) {
+ constexpr int offset = 0;
+
+ for (int wi_idx = offset, src = offset * 3; wi_idx < n_crrs; ++wi_idx) {
+ InterRes &dst_res = postp_buff[wi_idx];
+
+ for (int inner_func_index = 0; inner_func_index < 3;
+ ++inner_func_index, ++src) {
+ const CRRResParams &src_res = res_params[src];
+
+ for (int i = 0; i < 4; ++i) {
+ if (inner_func_index == 0) {
+ dst_res.pgreek[i] = src_res.pgreek[i];
+ }
+ }
+
+ dst_res.vals[inner_func_index] = src_res.optval0;
+ }
+ }
+}
+
+// Computes the Premium and Greeks.
+// Combines the three option prices solved for one CRR problem (res_params)
+// with the precomputed tree parameters (in_params) into the final output.
+// The Greeks are finite-difference estimates built from neighbouring tree
+// nodes (pgreek) and from the two extra re-pricings (vals[1], vals[2]).
+OutputRes ComputeOutput(const InputData &inp, const CRRInParams &in_params,
+                        const InterRes &res_params) {
+  double h;
+  OutputRes res;
+  // h: spot-price spacing between the up- and down-nodes around the root;
+  // denominator of the central-difference delta below.
+  h = inp.spot * (in_params.u2[0] - 1 / in_params.u2[0]);
+  res.value = res_params.pgreek[1];
+  res.delta = (res_params.pgreek[2] - res_params.pgreek[0]) / h;
+  // gamma: difference of the two one-sided slopes, normalised by h/2.
+  res.gamma = 2 / h *
+              ((res_params.pgreek[2] - res_params.pgreek[1]) / inp.spot /
+                   (in_params.u2[0] - 1) -
+               (res_params.pgreek[1] - res_params.pgreek[0]) / inp.spot /
+                   (1 - (1 / in_params.u2[0])));
+  // theta: value change between the root price (vals[0]) and the depth-4
+  // node snapshot (pgreek[3]), divided by the corresponding elapsed time
+  // 4 * (t / n_steps).
+  res.theta =
+      (res_params.vals[0] - res_params.pgreek[3]) / 4 / inp.t * inp.n_steps;
+  // rho / vega: forward differences against the kEpsilon-perturbed
+  // re-pricings in vals[1] and vals[2] — presumably rate- and
+  // volatility-bumped respectively; confirm against PrepareData.
+  res.rho = (res_params.vals[1] - res.value) / kEpsilon;
+  res.vega = (res_params.vals[2] - res.value) / kEpsilon;
+  return res;
+}
+
+// Perform CRR solving using the CPU and compare FPGA results with CPU results
+// to test correctness.
+//
+// k        : index of the CRR problem being checked (0-based).
+// n_crrs   : total number of CRR problems (summary is printed on the last).
+// pass     : accumulated pass/fail flag; cleared on any mismatch.
+// inp      : raw input data for this problem.
+// vals     : precomputed tree parameters; umin[] is advanced in place by the
+//            backward recursions below (hence the non-const reference).
+// fpga_res : result produced by the FPGA kernel for this problem.
+void TestCorrectness(int k, int n_crrs, bool &pass, const InputData &inp,
+                     CRRInParams &vals, const OutputRes &fpga_res) {
+  if (k == 0) {
+    std::cout << "\n============= Correctness Test ============= \n";
+    std::cout << "Running analytical correctness checks... \n";
+  }
+
+  // This CRR benchmark ensures a minimum 4 decimal points match between FPGA and CPU
+  // "threshold" is chosen to enforce this guarantee
+  float threshold = 0.00001;
+  int i, j, q;
+  double x;
+  int n_steps = vals.n_steps;
+  // The first pricing uses an extended tree (kOpt0 extra steps) so the
+  // node snapshots needed for the Greeks are available.
+  int m = n_steps + kOpt0;
+  vector pvalue(kMaxNSteps3);
+  vector pvalue_1(kMaxNSteps1);
+  vector pvalue_2(kMaxNSteps1);
+  vector pgreek(5);
+  InterRes cpu_res_params;
+  OutputRes cpu_res;
+
+  // option value computed at each final node
+  x = vals.umin[0];
+  for (i = 0; i <= m; i++, x *= vals.u2[0]) {
+    pvalue[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+  }
+
+  // backward recursion to evaluate option price; the fmax against the
+  // intrinsic value inp.cp * (x - strike) allows early exercise.
+  for (i = m - 1; i >= 0; i--) {
+    vals.umin[0] *= vals.u[0];
+    x = vals.umin[0];
+    for (j = 0; j <= i; j++, x *= vals.u2[0]) {
+      pvalue[j] = fmax(vals.c1[0] * pvalue[j] + vals.c2[0] * pvalue[j + 1],
+                       inp.cp * (x - inp.strike));
+    }
+    // Snapshot node values at depths 4 and 2; ComputeOutput turns these
+    // into the finite-difference Greeks.
+    if (i == 4) {
+      pgreek[4] = pvalue[2];
+    }
+    if (i == 2) {
+      for (q = 0; q <= 2; q++) {
+        pgreek[q + 1] = pvalue[q];
+      }
+    }
+  }
+  cpu_res_params.vals[0] = pvalue[0];
+
+  // the above computation is repeated for each option price
+  // (pricings 1 and 2 use the parameter sets at index [1]/[2]; presumably
+  // the rate- and volatility-perturbed trees — see ComputeOutput's rho/vega)
+  x = vals.umin[1];
+  for (i = 0; i <= n_steps; i++, x *= vals.u2[1]) {
+    pvalue_1[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+  }
+
+  for (i = n_steps - 1; i >= 0; i--) {
+    vals.umin[1] *= vals.u[1];
+    x = vals.umin[1];
+
+    for (j = 0; j <= i; j++, x *= vals.u2[1]) {
+      pvalue_1[j] =
+          fmax(vals.c1[1] * pvalue_1[j] + vals.c2[1] * pvalue_1[j + 1],
+               inp.cp * (x - inp.strike));
+    }
+  }
+  cpu_res_params.vals[1] = pvalue_1[0];
+
+  x = vals.umin[2];
+  for (i = 0; i <= n_steps; i++, x *= vals.u2[2]) {
+    pvalue_2[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+  }
+
+  for (i = n_steps - 1; i >= 0; i--) {
+    vals.umin[2] *= vals.u[2];
+    x = vals.umin[2];
+    for (j = 0; j <= i; j++, x *= vals.u2[2]) {
+      pvalue_2[j] =
+          fmax(vals.c1[2] * pvalue_2[j] + vals.c2[2] * pvalue_2[j + 1],
+               inp.cp * (x - inp.strike));
+    }
+  }
+  cpu_res_params.vals[2] = pvalue_2[0];
+  pgreek[0] = 0;
+
+  // pgreek[1..4] hold the snapshots; shift down into the result struct.
+  for (i = 1; i < 5; ++i) {
+    cpu_res_params.pgreek[i - 1] = pgreek[i];
+  }
+
+  cpu_res = ComputeOutput(inp, vals, cpu_res_params);
+
+  // Compare each CPU value against the FPGA result; any mismatch clears
+  // `pass` and is reported with full precision for debugging.
+  // NOTE(review): plain abs() on doubles relies on the std::abs overloads
+  // being in scope; fabs would be unambiguous — confirm headers.
+  if (abs(cpu_res.value - fpga_res.value) > threshold) {
+    pass = false;
+    std::cout << "fpga_res.value " << k << " = " << std::fixed
+              << std::setprecision(20) << fpga_res.value << "\n";
+    std::cout << "cpu_res.value " << k << " = " << std::fixed
+              << std::setprecision(20) << cpu_res.value << "\n";
+    std::cout << "Mismatch detected for value of crr " << k << "\n";
+  }
+  if (abs(cpu_res.delta - fpga_res.delta) > threshold) {
+    pass = false;
+    std::cout << "fpga_res.delta " << k << " = " << std::fixed
+              << std::setprecision(20) << fpga_res.delta << "\n";
+    std::cout << "cpu_res.delta " << k << " = " << std::fixed
+              << std::setprecision(20) << cpu_res.delta << "\n";
+    std::cout << "Mismatch detected for value of crr " << k << "\n";
+  }
+  if (abs(cpu_res.gamma - fpga_res.gamma) > threshold) {
+    pass = false;
+    std::cout << "fpga_res.gamma " << k << " = " << std::fixed
+              << std::setprecision(20) << fpga_res.gamma << "\n";
+    std::cout << "cpu_res.gamma " << k << " = " << std::fixed
+              << std::setprecision(20) << cpu_res.gamma << "\n";
+    std::cout << "Mismatch detected for value of crr " << k << "\n";
+  }
+  if (abs(cpu_res.vega - fpga_res.vega) > threshold) {
+    pass = false;
+    std::cout << "fpga_res.vega " << k << " = " << std::fixed
+              << std::setprecision(20) << fpga_res.vega << "\n";
+    std::cout << "cpu_res.vega " << k << " = " << std::fixed
+              << std::setprecision(20) << cpu_res.vega << "\n";
+    std::cout << "Mismatch detected for value of crr " << k << "\n";
+  }
+  if (abs(cpu_res.theta - fpga_res.theta) > threshold) {
+    pass = false;
+    std::cout << "fpga_res.theta " << k << " = " << std::fixed
+              << std::setprecision(20) << fpga_res.theta << "\n";
+    std::cout << "cpu_res.theta " << k << " = " << std::fixed
+              << std::setprecision(20) << cpu_res.theta << "\n";
+    std::cout << "Mismatch detected for value of crr " << k << "\n";
+  }
+  if (abs(cpu_res.rho - fpga_res.rho) > threshold) {
+    pass = false;
+    std::cout << "fpga_res.rho " << k << " = " << std::fixed
+              << std::setprecision(20) << fpga_res.rho << "\n";
+    std::cout << "cpu_res.rho " << k << " = " << std::fixed
+              << std::setprecision(20) << cpu_res.rho << "\n";
+    std::cout << "Mismatch detected for value of crr " << k << "\n";
+  }
+
+  // Print the overall verdict once, after the last CRR problem.
+  if (k == n_crrs - 1) {
+    std::cout << "CPU-FPGA Equivalence: " << (pass ? "PASS" : "FAIL") << "\n";
+  }
+}
+
+// Print out the achieved CRR throughput: assets (CRR problems) processed
+// per second over the timed CrrSolver run.
+void TestThroughput(const double &time, const int &n_crrs) {
+  std::cout << "\n============= Throughput Test =============\n";
+
+  std::cout << " Avg throughput: " << std::fixed << std::setprecision(1)
+            << (n_crrs / time) << " assets/s\n";
+}
+
+// Program entry point:
+//  1. Parse command-line arguments (input file; optional -o= /
+//     --output-file= for the output CSV).
+//  2. Read CRR problems from the CSV input and precompute kernel inputs.
+//  3. Run CrrSolver twice: an untimed warmup and a timed run.
+//  4. Post-process results, verify against a CPU reference, write the
+//     output CSV, and report throughput.
+//
+// NOTE(review): angle-bracket template arguments (e.g. on `vector` and
+// `get_info()`) appear to have been stripped when this patch text was
+// generated — confirm against the upstream sample before applying.
+int main(int argc, char *argv[]) {
+  string infilename = "";
+  string outfilename = "";
+
+  const string default_ifile = "src/data/ordered_inputs.csv";
+  const string default_ofile = "src/data/ordered_outputs.csv";
+
+  // Arguments starting with '-' are options (only the output-file flags
+  // are recognized); any bare argument is taken as the input file name.
+  char str_buffer[kMaxStringLen] = {0};
+  for (int i = 1; i < argc; i++) {
+    if (argv[i][0] == '-') {
+      string sarg(argv[i]);
+
+      FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen);
+      FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen);
+    } else {
+      infilename = string(argv[i]);
+    }
+  }
+
+  try {
+    // Compile-time choice between the FPGA emulator and a real FPGA device.
+#if defined(FPGA_EMULATOR)
+    intel::fpga_emulator_selector device_selector;
+#else
+    intel::fpga_selector device_selector;
+#endif
+
+    queue q(device_selector, dpc_common::exception_handler);
+
+    std::cout << "Running on device: "
+              << q.get_device().get_info().c_str() << "\n";
+
+    device device = q.get_device();
+    std::cout << "Device name: "
+              << device.get_info().c_str() << "\n \n \n";
+
+    vector inp;
+
+    // Get input file name, if users don't have their test input file, this
+    // design will use the default input file
+    if (infilename == "") {
+      infilename = default_ifile;
+    }
+    ifstream inputFile(infilename);
+
+    if (!inputFile.is_open()) {
+      std::cerr << "Input file doesn't exist \n";
+      return 1;
+    }
+
+    // Check input file format (extension must be .csv)
+    string filename = infilename;
+    std::size_t found = filename.find_last_of(".");
+    if (!(filename.substr(found + 1).compare("csv") == 0)) {
+      std::cerr << "Input file format only support .csv\n";
+      return 1;
+    }
+
+    // Get output file name, if users don't define output file name, the design
+    // will use the default output file
+    outfilename = default_ofile;
+    if (strlen(str_buffer)) {
+      outfilename = string(str_buffer);
+    }
+
+    // Check output file format (extension must be .csv)
+    filename = outfilename;
+    found = filename.find_last_of(".");
+    if (!(filename.substr(found + 1).compare("csv") == 0)) {
+      std::cerr << "Output file format only support .csv\n";
+      return 1;
+    }
+
+    // Read inputs data from input file
+    ReadInputFromFile(inputFile, inp);
+
+// Get the number of data from the input file
+// Emulator mode only goes through one input (or through OUTER_UNROLL inputs) to
+// ensure fast runtime
+#if defined(FPGA_EMULATOR)
+    int temp_crrs = 1;
+#else
+    int temp_crrs = inp.size();
+#endif
+
+    // Check if n_crrs >= OUTER_UNROLL; the kernel needs at least
+    // OUTER_UNROLL problems, so pad up to it when the file provides enough.
+    if (OUTER_UNROLL >= temp_crrs) {
+      if (inp.size() < OUTER_UNROLL) {
+        std::cerr << "Input size must be greater than or equal to OUTER_UNROLL\n";
+        return 1;
+      } else {
+        temp_crrs = OUTER_UNROLL;
+      }
+    }
+
+    const int n_crrs = temp_crrs;
+
+    // Per-problem precomputed parameters for the CPU-side preparation.
+    vector in_params(n_crrs);
+    vector array_params(n_crrs);
+
+    for (int j = 0; j < n_crrs; ++j) {
+      in_params[j] = PrepareData(inp[j]);
+      array_params[j] = PrepareArrData(in_params[j]);
+    }
+
+    // following vectors are arguments for CrrSolver
+    // (3 entries per CRR problem: one per inner pricing function)
+    vector in_buff_params(n_crrs * 3);
+    vector in_buff2_params(n_crrs * 3);
+
+    vector res_params(n_crrs * 3);
+    vector res_params_dummy(n_crrs * 3);
+
+    // Prepare metadata as input to kernel
+    PrepareKernelData(in_params, array_params, in_buff_params, in_buff2_params,
+                      n_crrs);
+
+    // warmup run - use this run to warmup accelerator
+    CrrSolver(n_crrs, in_buff_params, res_params_dummy, in_buff2_params,
+              q);
+    // Timed run - profile performance
+    double time = CrrSolver(n_crrs, in_buff_params, res_params,
+                            in_buff2_params, q);
+    bool pass = true;
+
+    // Postprocessing step
+    // process_res used to compute final results
+    vector process_res(n_crrs);
+    ProcessKernelResult(res_params, process_res, n_crrs);
+
+    vector result(n_crrs);
+    for (int i = 0; i < n_crrs; ++i) {
+      result[i] = ComputeOutput(inp[i], in_params[i], process_res[i]);
+      TestCorrectness(i, n_crrs, pass, inp[i], in_params[i], result[i]);
+    }
+
+    // Write outputs data to output file
+    ofstream outputFile(outfilename);
+
+    WriteOutputToFile(outputFile, result);
+
+    TestThroughput(time, n_crrs);
+
+  } catch (sycl::exception const &e) {
+    std::cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
+    std::cout << " If you are targeting an FPGA hardware, "
+                 "ensure that your system is plugged to an FPGA board that is "
+                 "set up correctly\n";
+    std::cout << " If you are targeting the FPGA emulator, compile with "
+                 "-DFPGA_EMULATOR\n";
+    return 1;
+  }
+  return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt
new file mode 100755
index 0000000000..9ac77b0aff
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt
@@ -0,0 +1,16 @@
+# cmake_minimum_required must be the first command in a top-level
+# CMakeLists.txt (per CMake documentation), before any other configuration.
+cmake_minimum_required (VERSION 2.8)
+
+# Use the Intel oneAPI DPC++ compiler; set before project() so the
+# compiler check picks it up.
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+project(GZip)
+
+# Collect all build artifacts directly in the build directory.
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md
new file mode 100755
index 0000000000..18117a82a5
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md
@@ -0,0 +1,201 @@
+# GZIP Compression
+Reference design demonstrating high-performance GZIP compression on FPGA.
+
+***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. Additional reference material specific to this GZIP implementation is provided in the References section of this README.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta); Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How to implement a high performance multi-engine compression algorithm on FPGA
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+**Performance**
+Please refer to performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 1 engine @ 3.4 GB/s
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 2 engines @ 5.5 GB/s each = 11.0 GB/s total
+
+
+## Purpose
+
+This DPC++ reference design implements a compression algorithm. The implementation is optimized for the FPGA device. The compression result is GZIP-compatible and can be decompressed with GUNZIP. The GZIP output file format is compatible with GZIP's DEFLATE algorithm, and follows a fixed subset of [RFC 1951](https://www.ietf.org/rfc/rfc1951.txt). See the References section for more specific references.
+
+The algorithm uses a GZIP-compatible Lempel-Ziv 77 (LZ77) algorithm for data de-duplication, and a GZIP-compatible Static Huffman algorithm for bit reduction. The implementation includes three FPGA accelerated tasks (LZ77, Static Huffman and CRC).
+
+The FPGA implementation of the algorithm enables either one or two independent GZIP compute engines to operate in parallel on the FPGA. The number of engines is constrained by the available FPGA resources. By default, the design is parameterized to create a single engine when the design is compiled targeting Intel® PAC with Intel Arria® 10 GX FPGA. Two engines are created when targeting Intel® PAC with Intel Stratix® 10 SX FPGA, a larger device.
+
+## Key Implementation Details
+
+ | Kernel | Description
+--- |---
+| LZ Reduction | Implements a LZ77 algorithm for data de-duplication. The algorithm produces distance and length information that is compatible with GZIP's DEFLATE implementation.
+| Static Huffman | Uses the same Static Huffman codes used by GZIP's DEFLATE algorithm when it chooses a Static Huffman coding scheme for bit reduction. This choice maintains compatibility with GUNZIP.
+| CRC | Adds a CRC checksum based on the input file; this is required by the gzip file format
+
+To optimize performance, GZIP leverages techniques discussed in the following FPGA tutorials:
+* **Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing** (double_buffering)
+* **On-Chip Memory Attributes** (mem_config)
+
+
+## License
+This code sample is licensed under MIT license.
+
+
+## Building the `gzip` Reference Design
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+     ./gzip.fpga_emu <input_file> [-o=<output_file>]     (Linux)
+     gzip.fpga_emu.exe <input_file> [-o=<output_file>]   (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+     ./gzip.fpga <input_file> [-o=<output_file>]         (Linux)
+ ```
+ ### Application Parameters
+
+| Argument | Description
+--- |---
+| `<input_file>` | Mandatory argument that specifies the file to be compressed. Use a 120+ MB file to achieve peak performance.
+| `-o=<output_file>` | Optional argument that specifies the name of the output file. The default name of the output file is `<input_file>.gz`. When targeting Intel Stratix® 10 SX, the single `<input_file>` is fed to both engines, yielding two identical output files, using `<output_file>` as the basis for the filenames.
+
+### Example of Output
+
+```
+Running on device: pac_a10 : Intel PAC Platform (pac_ee00000)
+Throughput: 3.4321 GB/s
+Compression Ratio 33.2737%
+PASSED
+```
+## Additional Design Information
+### Source Code Explanation
+
+| File | Description
+--- |---
+| `gzip.cpp` | Contains the `main()` function and the top-level interfaces to the SYCL* GZIP functions.
+| `gzipkernel.cpp` | Contains the SYCL* kernels used to implement GZIP.
+| `CompareGzip.cpp` | Contains code to compare a GZIP-compatible file with the original input.
+| `WriteGzip.cpp` | Contains code to write a GZIP compatible file.
+| `crc32.cpp` | Contains code to calculate a 32-bit CRC that is compatible with the GZIP file format and to combine multiple 32-bit CRC values. It is used to account only for the CRC of the last few bytes in the file, which are not processed by the accelerated CRC kernel.
+| `kernels.hpp` | Contains miscellaneous defines and structure definitions required by the LZReduction and Static Huffman kernels.
+| `crc32.hpp` | Header file for `crc32.cpp`.
+| `gzipkernel.hpp` | Header file for `gzipkernels.cpp`.
+| `CompareGzip.hpp` | Header file for `CompareGzip.cpp`.
+| `WriteGzip.hpp` | Header file for `WriteGzip.cpp`.
+
+### Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus
+`-Xsseed=1` | Uses seed 1 during Quartus, yields slightly higher fmax
+`-Xsnum-reorder=6` | On Intel Stratix® 10 SX only, specify a wider data path for read data from global memory
+`-DNUM_ENGINES=<1|2>` | Specifies that 1 GZIP engine should be compiled when targeting Arria® 10 GX and 2 engines when targeting Intel Stratix® 10 SX
+
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](https://www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 29, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](https://www.intel.com).
+
+The performance was measured by Intel on July 29, 2020
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+### References
+[Khronos SYCL Resources](https://www.khronos.org/sycl/resources)
+
+[Intel GZIP OpenCL Design Example](https://www.intel.com/content/www/us/en/programmable/support/support-resources/design-examples/design-software/opencl/gzip-compression.html)
+
+[RFC 1951 - DEFLATE Data Format](https://www.ietf.org/rfc/rfc1951.txt)
+
+[RFC 1952 - GZIP Specification 4.3](https://www.ietf.org/rfc/rfc1952.txt)
+
+[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt
new file mode 100755
index 0000000000..a75dd96a90
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt
@@ -0,0 +1,25 @@
+zlib License
+
+ zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.11, January 15th, 2017
+
+ Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup@gzip.org madler@alumni.caltech.edu
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln
new file mode 100755
index 0000000000..580f35f08b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gzip", "gzip.vcxproj", "{CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.ActiveCfg = Debug|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.Build.0 = Debug|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.ActiveCfg = Release|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {92BEFAAB-0365-4E5A-9C4A-E50AB49B2A6B}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj
new file mode 100755
index 0000000000..cf6a2462d2
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj
@@ -0,0 +1,174 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {cf6a576b-665d-4f24-bb62-0dae7a7b3c64}
+ Win32Proj
+ gzip
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user
new file mode 100755
index 0000000000..1956841792
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user
@@ -0,0 +1,14 @@
+
+
+
+ false
+
+
+ src/gzip.cpp -o=test.gz
+ WindowsLocalDebugger
+
+
+ src/gzip.cpp -o=test.gz
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json
new file mode 100755
index 0000000000..a6d65ecd17
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "D55081EB-669D-4832-BCE6-23EE2ACA9F0F",
+ "name": "GZIP Compression",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "Reference design demonstrating high-performance GZIP compression on FPGA",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./gzip.fpga_emu ../src/gzip.cpp -o=test.gz"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "gzip.fpga_emu.exe ../src/gzip.cpp -o=test.gz"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt
new file mode 100755
index 0000000000..bf6125045f
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt
@@ -0,0 +1,125 @@
+set(DEVICE_SOURCE_FILE gzipkernel.cpp)
+set(DEVICE_HEADER_FILE gzipkernel.hpp)
+set(HOST_SOURCE_FILE gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp)
+
+set(TARGET_NAME gzip)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+
+# To increase NUM_ENGINES to greater than 2, must also statically declare more engines in gzipkernel.cpp --> SubmitGzipTasks()
+set(NUM_ENGINES_A10 1)
+set(NUM_ENGINES_S10 2)
+set(NUM_REORDER "")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(NUM_ENGINES ${NUM_ENGINES_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(NUM_ENGINES ${NUM_ENGINES_S10})
+ set(NUM_REORDER "-Xsnum-reorder=6")
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+#specify -MMD -fsycl-link-targets=... instead of -fintelfpga to workaround known issue; lower report quality
+set(HARDWARE_COMPILE_FLAGS -MMD -fsycl-link-targets=spir64_fpga-unknown-unknown-sycldevice -c -DNUM_ENGINES=${NUM_ENGINES})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsparallel=2 -Xsseed=1 ${NUM_REORDER} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DNUM_ENGINES=${NUM_ENGINES})
+set(FINAL_LINK_FLAGS -fintelfpga -DNUM_ENGINES=${NUM_ENGINES})
+
+set(EMULATOR_COMPILE_FLAGS "-v -v -v -g0 -fintelfpga -DFPGA_EMULATOR -DNUM_ENGINES=${NUM_ENGINES}")
+set(EMULATOR_LINK_FLAGS -fintelfpga)
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "gzipkernel_fpga.o")
+ set(DEVICE_IMAGE_FPGA_OBJ "gzipkernel_fpga.a")
+ set(HOST_SOURCE_FILES_WITH_PATH ${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp)
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+ set(OBJ_FILES)
+ foreach(HOST_FILE ${HOST_SOURCE_FILES_WITH_PATH})
+ set(HOST_FPGA_OBJ ${HOST_FILE}.o)
+ add_custom_command(OUTPUT ${HOST_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${HOST_FILE} -o ${HOST_FPGA_OBJ}
+ DEPENDS ${HOST_FILE})
+ list(APPEND OBJ_FILES ${HOST_FPGA_OBJ})
+ endforeach()
+
+ add_custom_command(OUTPUT ${DEVICE_IMAGE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link=image ${DEVICE_FPGA_OBJ} -o ${DEVICE_IMAGE_FPGA_OBJ}
+ DEPENDS ${DEVICE_FPGA_OBJ} ${OBJ_FILES})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${FINAL_LINK_FLAGS} ${OBJ_FILES} ${DEVICE_IMAGE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_IMAGE_FPGA_OBJ} ${OBJ_FILES})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${DEVICE_SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/kernels.hpp kernels.hpp COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_HEADER_FILE} ${DEVICE_HEADER_FILE} COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE} kernels.hpp)
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu Makefile -o=test.gz
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp
new file mode 100755
index 0000000000..b803dee96b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp
@@ -0,0 +1,85 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include "CompareGzip.hpp"
+
+// returns 0 on success, otherwise failure
+int CompareGzipFiles(
+ const std::string
+ &original_file, // original input file to compare gzip uncompressed
+ const std::string &input_gzfile) // gzip file to check
+{
+#ifdef _MSC_VER
+ std::cout
+ << "Info: skipping output verification on Windows, no builtin gunzip\n";
+ return 0;
+#else
+ //------------------------------------------------------------------
+ // assume all good to start with.
+
+ int gzipstatus = 0;
+
+ //------------------------------------------------------------------
+ // Create temporary output filename for gunzip
+
+ char tmp_name[] = "/tmp/gzip_fpga.XXXXXX";
+ mkstemp(tmp_name);
+ std::string outputfile = tmp_name;
+
+ //------------------------------------------------------------------
+ // Check that the original file and gzipped file exist.
+
+ //------------------------------------------------------------------
+ // gunzip the file produced to stdout, capturing to the temp file.
+
+ std::string cmd = "gunzip -c ";
+ cmd += input_gzfile;
+ cmd += " > " + outputfile;
+
+ int gzout = ::system(cmd.c_str());
+ if (gzout != 0) {
+ gzipstatus = 3;
+ }
+
+ //------------------------------------------------------------------
+ // diff the temp file and the original.
+
+ cmd = "diff -q " + outputfile + " " + original_file;
+ int diffout = ::system(cmd.c_str());
+ if (diffout != 0) {
+ gzipstatus = 4;
+ }
+
+ //------------------------------------------------------------------
+ // Cleanup, remove the temp file.
+
+ (void)::remove(outputfile.c_str());
+
+ return gzipstatus;
+#endif
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp
new file mode 100755
index 0000000000..5624b97cea
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp
@@ -0,0 +1,41 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __COMPAREGZIP_H__
+#define __COMPAREGZIP_H__
+#pragma once
+
+#include <iostream>
+#include <string>
+
+int CompareGzipFiles(
+ const std::string
+ &original_file, // original input file to compare gzip uncompressed
+ const std::string &input_gzfile); // gzip file to check
+
+#endif //__COMPAREGZIP_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
new file mode 100755
index 0000000000..71c370aa96
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
@@ -0,0 +1,163 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#define _CRT_SECURE_NO_WARNINGS
+#include "WriteGzip.hpp"
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+constexpr int kDeflated = 8;
+#define GZIP_MAGIC "\037\213" // Magic header for gzip files, 1F 8B
+
+#define ORIG_NAME 0x08
+#define OS_CODE 0x03 // Unix OS_CODE
+
+typedef struct GzipHeader {
+ unsigned char magic[2]; // 0x1f, 0x8b
+ unsigned char compress_method; // 0-7 reserved, 8=deflate -- kDeflated
+ unsigned char flags; // b0: file probably ascii
+ // b1: header crc-16 present
+ // b2: extra field present
+ // b3: original file name present
+ // b4: file comment present
+ // b5,6,7: reserved
+ unsigned long time; // file modification time in Unix format.
+ // Set this to 0 for now.
+
+ unsigned char extra; // depends on compression method
+ unsigned char os; // operating system on which compression took place
+
+ // ...
+ // ? bytes ... compressd data ...
+
+ unsigned long crc;
+ unsigned long uncompressed_sz;
+
+} gzip_header, *pgzip_header;
+
+inline static void PutUlong(uint8_t *pc, unsigned long l) {
+ pc[0] = l & 0xff;
+ pc[1] = (l >> 8) & 0xff;
+ pc[2] = (l >> 16) & 0xff;
+ pc[3] = (l >> 24) & 0xff;
+}
+
+// returns 0 on success, otherwise failure
+int WriteBlockGzip(
+ std::string &original_filename, // Original file name being compressed
+ std::string &out_filename, // gzip filename
+ char *obuf, // pointer to compressed data block
+ size_t blen, // length of compressed data block
+ size_t ilen, // original block length
+ uint32_t buffer_crc) // the block's crc
+{
+ //------------------------------------------------------------------
+ // Setup the gzip output file header.
+ // max filename size is arbitrarily set to 256 bytes long
+ // Method is always DEFLATE
+ // Original filename is always set in header
+ // timestamp is set to 0 - ignored by gunzip
+ // deflate flags set to 0
+ // OS code is 0
+
+ int max_filename_sz = 256;
+
+ unsigned char *pgziphdr =
+ (unsigned char *)malloc(sizeof(gzip_header) + max_filename_sz);
+
+ if (!pgziphdr) {
+ std::cout << "pgzip header cannot be allocated\n";
+ return 1;
+ }
+
+ pgziphdr[0] = GZIP_MAGIC[0];
+ pgziphdr[1] = GZIP_MAGIC[1];
+ pgziphdr[2] = kDeflated;
+ pgziphdr[3] = ORIG_NAME;
+
+ // Set time in header to 0, this is ignored by gunzip.
+ pgziphdr[4] = 0;
+ pgziphdr[5] = 0;
+ pgziphdr[6] = 0;
+ pgziphdr[7] = 0;
+
+ // Deflate flags
+ pgziphdr[8] = 0;
+
+ // OS code is Linux in this case.
+ pgziphdr[9] = OS_CODE;
+
+ int ondx = 10;
+
+ const char *p = original_filename.c_str();
+ do {
+ pgziphdr[ondx++] = (*p);
+ } while (*p++);
+
+ int header_bytes = ondx;
+
+ unsigned char prolog[8];
+
+ PutUlong(((unsigned char *)prolog), buffer_crc);
+ PutUlong(((unsigned char *)&prolog[4]), ilen);
+
+ FILE *fo = fopen(out_filename.c_str(), "w+");
+ if (ferror(fo)) {
+ std::cout << "Cannot open file for output: " << out_filename << "\n";
+ free(pgziphdr);
+ return 1;
+ }
+
+ fwrite(pgziphdr, 1, header_bytes, fo);
+ fwrite(obuf, 1, blen, fo);
+ fwrite(prolog, 1, 8, fo);
+
+ if (ferror(fo)) {
+ std::cout << "gzip output file write failure.\n";
+ free(pgziphdr);
+ return 1;
+ }
+
+ if (fclose(fo)) {
+ perror("close");
+ free(pgziphdr);
+ return 1;
+ }
+ free(pgziphdr);
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp
new file mode 100755
index 0000000000..66bc28e315
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp
@@ -0,0 +1,45 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __WRITEGZIP_H__
+#define __WRITEGZIP_H__
+#pragma once
+
+#include <stdint.h>
+#include <string>
+
+// returns 0 on success, otherwise failure
+int WriteBlockGzip(
+ std::string &original_filename, // Original file name being compressed
+ std::string &out_filename, // gzip filename
+ char *obuf, // pointer to compressed data block
+ size_t blen, // length of compressed data block
+ size_t ilen, // original block length
+ uint32_t buffer_crc); // the block's crc
+
+#endif //__WRITEGZIP_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja
new file mode 100755
index 0000000000..29d50e63a0
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja
@@ -0,0 +1,32 @@
+device_source_file = gzipkernel.cpp
+device_header_file = gzipkernel.h
+host_source_file = gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp
+target_name = gzip
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsclock=280MHz -Xsparallel=2 -Xsseed=1
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${device_source_file} ${host_source_file} -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${device_source_file} ${host_source_file} -fsycl-link -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${device_source_file} ${host_source_file} -fsycl-link -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp
new file mode 100755
index 0000000000..8e6c59c734
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp
@@ -0,0 +1,126 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+/*
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "crc32.hpp"
+
+// This table is CRC32s for all single byte values created by using the
+// makecrc.c utility from gzip for compatibility with gzip. makecrc.c can be
+// found in the gzip source code project found at
+// https://git.savannah.gnu.org/git/gzip.git. The polynomial 0xedb88320 is used
+// for gzip, and thus used to create this table.
+//
+// Not copyrighted 1990, Mark Adler.
+//
+const unsigned int crc32_table[] = {
+ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
+ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
+ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
+ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
+ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
+ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
+ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
+ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
+ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
+ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
+ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
+ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
+ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
+ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
+ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
+ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
+ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
+ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
+ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
+ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
+ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
+ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
+ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
+ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
+ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
+ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
+ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
+ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
+ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
+ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
+ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
+ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
+ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
+ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
+ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
+ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
+ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
+ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
+ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
+ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
+ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
+ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
+ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
+ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
+ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
+ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
+ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
+ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
+ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
+ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
+ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
+ 0x2d02ef8dL};
+
+//
+// This routine creates a Crc32 from a memory buffer (address, and length), and
+// a previous crc. This routine can be called iteratively on different portions
+// of the same buffer, using a previously returned crc value. The
+// value 0xffffffff is used for the first buffer invocation.
+unsigned int Crc32Host(
+ const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ unsigned int previous_crc) // previous CRC, allows combining.
+{
+ unsigned int curr_crc = ~previous_crc;
+ if (sz) do {
+ curr_crc =
+ crc32_table[((int)curr_crc ^ (*pbuf++)) & 0xff] ^ (curr_crc >> 8);
+ } while (--sz);
+ return curr_crc ^ 0xffffffffL;
+}
+
+unsigned int Crc32(const char *in, size_t buffer_sz,
+ unsigned int previous_crc) {
+ const int num_nibbles_parallel = 64;
+ const int num_sections =
+ buffer_sz / (num_nibbles_parallel / 2); // how many loop iterations
+ // now deal with the remainder, this should be done on the software host
+ // the post-invert also happens inside crc_reference
+ const char *remaining_data = &in[num_sections * (num_nibbles_parallel / 2)];
+ int remaining_bytes = buffer_sz % (num_nibbles_parallel / 2);
+ return Crc32Host(remaining_data, remaining_bytes, previous_crc);
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
new file mode 100755
index 0000000000..138a8f0754
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
@@ -0,0 +1,46 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __CRC32_H__
+#define __CRC32_H__
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+uint32_t Crc32Host(
+ const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ uint32_t previous_crc); // previous CRC, allows combining. First invocation
+ // would use 0xffffffff.
+uint32_t Crc32(const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ uint32_t previous_crc); // previous CRC, allows combining. First
+ // invocation would use 0xffffffff.
+
+#endif //__CRC32_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp
new file mode 100755
index 0000000000..9ecfe11728
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp
@@ -0,0 +1,520 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "CompareGzip.hpp"
+#include "WriteGzip.hpp"
+#include "crc32.hpp"
+#include "dpc_common.hpp"
+#include "gzipkernel.hpp"
+#include "kernels.hpp"
+
+using namespace sycl;
+
+// The minimum file size of a file to be compressed.
+// Any filesize less than this results in an error.
+constexpr int minimum_filesize = kVec + 1;
+
+bool help = false;
+
+int CompressFile(queue &q, std::string &input_file, std::vector<std::string> outfilenames,
+ int iterations, bool report);
+
+void Help(void) {
+ // Command line arguments.
+ // gzip [options] filetozip [options]
+ // -h,--help : help
+
+ // future options?
+ // -p,performance : output perf metrics
+ // -m,maxmapping=# : maximum mapping size
+
+ std::cout << "gzip filename [options]\n";
+ std::cout << " -h,--help : this help text\n";
+ std::cout
+ << " -o=<filename>,--output-file=<filename> : specify output file\n";
+}
+
+bool FindGetArg(std::string &arg, const char *str, int defaultval, int *val) {
+ std::size_t found = arg.find(str, 0, strlen(str));
+ if (found != std::string::npos) {
+ int value = atoi(&arg.c_str()[strlen(str)]);
+ *val = value;
+ return true;
+ }
+ return false;
+}
+
+constexpr int kMaxStringLen = 40;
+
+bool FindGetArgString(std::string &arg, const char *str, char *str_value,
+ size_t maxchars) {
+ std::size_t found = arg.find(str, 0, strlen(str));
+ if (found != std::string::npos) {
+ const char *sptr = &arg.c_str()[strlen(str)];
+ for (int i = 0; i < maxchars - 1; i++) {
+ char ch = sptr[i];
+ switch (ch) {
+ case ' ':
+ case '\t':
+ case '\0':
+ str_value[i] = 0;
+ return true;
+ break;
+ default:
+ str_value[i] = ch;
+ break;
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+size_t SyclGetExecTimeNs(event e) {
+ size_t start_time =
+ e.get_profiling_info<info::event_profiling::command_start>();
+ size_t end_time =
+ e.get_profiling_info<info::event_profiling::command_end>();
+ return (end_time - start_time);
+}
+
+int main(int argc, char *argv[]) {
+ std::string infilename = "";
+
+ std::vector<std::string> outfilenames (kNumEngines);
+
+ char str_buffer[kMaxStringLen] = {0};
+
+ // Check the number of arguments specified
+ if (argc != 3) {
+ std::cerr << "Incorrect number of arguments. Correct usage: " << argv[0]
+ << " <input_file> -o=<output_file>\n";
+ return 1;
+ }
+
+ for (int i = 1; i < argc; i++) {
+ if (argv[i][0] == '-') {
+ std::string sarg(argv[i]);
+ if (std::string(argv[i]) == "-h") {
+ help = true;
+ }
+ if (std::string(argv[i]) == "--help") {
+ help = true;
+ }
+
+ FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen);
+ FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen);
+ } else {
+ infilename = std::string(argv[i]);
+ }
+ }
+
+ if (help) {
+ Help();
+ return 1;
+ }
+
+ try {
+#ifdef FPGA_EMULATOR
+ intel::fpga_emulator_selector device_selector;
+#else
+ intel::fpga_selector device_selector;
+#endif
+ auto prop_list = property_list{property::queue::enable_profiling()};
+ queue q(device_selector, dpc_common::exception_handler, prop_list);
+
+ std::cout << "Running on device: "
+ << q.get_device().get_info<info::device::name>().c_str() << "\n";
+
+ if (infilename == "") {
+ std::cout << "Must specify a filename to compress\n\n";
+ Help();
+ return 1;
+ }
+
+ // next, check valid and acceptable parameter ranges.
+ // if output filename not set, use the default
+ // name, else use the name specified by the user
+ outfilenames[0] = std::string(infilename) + ".gz";
+ if (strlen(str_buffer)) {
+ outfilenames[0] = std::string(str_buffer);
+ }
+ for (size_t i=1; i< kNumEngines; i++) {
+ // Filenames will be of the form outfilename, outfilename2, outfilename3 etc.
+ outfilenames[i] = outfilenames[0] + std::to_string(i+1);
+ }
+
+ std::cout << "Launching GZIP application with " << kNumEngines
+ << " engines\n";
+
+#ifdef FPGA_EMULATOR
+ CompressFile(q, infilename, outfilenames, 1, true);
+#else
+ // warmup run - use this run to warmup accelerator. There are some steps in
+ // the runtime that are only executed on the first kernel invocation but not
+ // on subsequent invocations. So execute all that stuff here before we
+ // measure performance (in the next call to CompressFile().
+ CompressFile(q, infilename, outfilenames, 1, false);
+ // profile performance
+ CompressFile(q, infilename, outfilenames, 200, true);
+#endif
+ } catch (sycl::exception const &e) {
+ // Catches exceptions in the host code
+ std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) {
+ std::cout << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cout << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+ return 0;
+}
+
+struct KernelInfo {
+ buffer<struct GzipOutInfo, 1> *gzip_out_buf;
+ buffer<unsigned, 1> *current_crc;
+ buffer<char, 1> *pobuf;
+ buffer<char, 1> *pibuf;
+ char *pobuf_decompress;
+
+ uint32_t buffer_crc[kMinBufferSize];
+ uint32_t refcrc;
+
+ const char *pref_buffer;
+ char *poutput_buffer;
+ size_t file_size;
+ struct GzipOutInfo out_info[kMinBufferSize];
+ int iteration;
+ bool last_block;
+};
+
+// returns 0 on success, otherwise a non-zero failure code.
+int CompressFile(queue &q, std::string &input_file, std::vector<std::string> outfilenames,
+ int iterations, bool report) {
+ size_t isz;
+ char *pinbuf;
+
+ // Read the input file
+ std::string device_string =
+ q.get_device().get_info<info::device::name>().c_str();
+ bool prepin =
+ (device_string.find("s10") !=
+ std::string::npos); // Check if "s10" is found in the device string. If
+ // the device is S10, we pre-pin some buffers to
+ // improve DMA performance, which is needed to
+ // achieve peak kernel throughput. Pre-pinning is
+ // only supported on the PAC-S10 BSP. It's not
+ // needed on PAC-A10 to achieve peak performance.
+
+ std::ifstream file(input_file,
+ std::ios::in | std::ios::binary | std::ios::ate);
+ if (file.is_open()) {
+ isz = file.tellg();
+ if (prepin) {
+ pinbuf = (char *)malloc_host(
+ isz, q.get_context()); // Pre-pin the buffer, for faster DMA
+ } else { // throughput, using malloc_host().
+ pinbuf = new char[isz];
+ }
+ file.seekg(0, std::ios::beg);
+ file.read(pinbuf, isz);
+ file.close();
+ } else {
+ std::cout << "Error: cannot read specified input file\n";
+ return 1;
+ }
+
+ if (isz < minimum_filesize) {
+ std::cout << "Minimum filesize for compression is " << minimum_filesize
+ << "\n";
+ return 1;
+ }
+
+ int buffers_count = iterations;
+
+ // Create an array of kernel info structures and create buffers for kernel
+ // input/output. The buffers are re-used between iterations, but enough
+ // disjoint buffers are created to support double-buffering.
+ struct KernelInfo *kinfo[kNumEngines];
+ for (size_t eng = 0; eng < kNumEngines; eng++) {
+ kinfo[eng] =
+ (struct KernelInfo *)malloc(sizeof(struct KernelInfo) * buffers_count);
+ if (kinfo[eng] == NULL) {
+ std::cout << "Cannot allocate kernel info buffer.\n";
+ return 1;
+ }
+ for (int i = 0; i < buffers_count; i++) {
+ kinfo[eng][i].file_size = isz;
+ // Allocating slightly larger buffers (+ 16 * kVec) to account for
+ // granularity of kernel writes
+ int outputSize = kinfo[eng][i].file_size + 16 * kVec < kMinBufferSize
+ ? kMinBufferSize
+ : kinfo[eng][i].file_size + 16 * kVec;
+
+ // Pre-pin buffer using malloc_host() to improve DMA bandwidth.
+ if (i >= 3) {
+ kinfo[eng][i].poutput_buffer = kinfo[eng][i - 3].poutput_buffer;
+ } else {
+ if (prepin) {
+ kinfo[eng][i].poutput_buffer =
+ (char *)malloc_host(outputSize, q.get_context());
+ } else {
+ kinfo[eng][i].poutput_buffer = (char *)malloc(outputSize);
+ }
+ if (kinfo[eng][i].poutput_buffer == NULL) {
+ std::cout << "Cannot allocate output buffer.\n";
+ free(kinfo);
+ return 1;
+ }
+ // zero pages to fully allocate them
+ memset(kinfo[eng][i].poutput_buffer, 0, outputSize);
+ }
+
+ kinfo[eng][i].last_block = true;
+ kinfo[eng][i].iteration = i;
+ kinfo[eng][i].pref_buffer = pinbuf;
+
+ kinfo[eng][i].gzip_out_buf =
+ i >= 3 ? kinfo[eng][i - 3].gzip_out_buf
+ : new buffer<struct GzipOutInfo, 1>(kMinBufferSize);
+ kinfo[eng][i].current_crc = i >= 3
+ ? kinfo[eng][i - 3].current_crc
+ : new buffer<unsigned, 1>(kMinBufferSize);
+ kinfo[eng][i].pibuf = i >= 3
+ ? kinfo[eng][i - 3].pibuf
+ : new buffer<char, 1>(kinfo[eng][i].file_size);
+ kinfo[eng][i].pobuf =
+ i >= 3 ? kinfo[eng][i - 3].pobuf : new buffer<char, 1>(outputSize);
+ kinfo[eng][i].pobuf_decompress = (char *)malloc(kinfo[eng][i].file_size);
+ }
+ }
+
+ // Create events for the various parts of the execution so that we can profile
+ // their performance.
+ event e_input_dma [kNumEngines][buffers_count]; // Input to the GZIP engine. This is a transfer from host to device.
+ event e_output_dma [kNumEngines][buffers_count]; // Output from the GZIP engine. This is transfer from device to host.
+ event e_crc_dma [kNumEngines][buffers_count]; // Transfer CRC from device to host
+ event e_size_dma [kNumEngines][buffers_count]; // Transfer compressed file size from device to host
+ event e_k_crc [kNumEngines][buffers_count]; // CRC kernel
+ event e_k_lz [kNumEngines][buffers_count]; // LZ77 kernel
+ event e_k_huff [kNumEngines][buffers_count]; // Huffman Encoding kernel
+
+#ifndef FPGA_EMULATOR
+ dpc_common::TimeInterval perf_timer;
+#endif
+
+
+ /*************************************************/
+ /* Main loop where the actual execution happens */
+ /*************************************************/
+ for (int i = 0; i < buffers_count; i++) {
+ for (size_t eng = 0; eng < kNumEngines; eng++) {
+ // Transfer the input data, to be compressed, from host to device.
+ e_input_dma[eng][i] = q.submit([&](handler &h) {
+ auto in_data =
+ kinfo[eng][i].pibuf->get_access<access::mode::discard_write>(h);
+ h.copy(kinfo[eng][i].pref_buffer, in_data);
+ });
+
+ /************************************/
+ /************************************/
+ /* LAUNCH GZIP ENGINE */
+ /************************************/
+ /************************************/
+ SubmitGzipTasks(q, kinfo[eng][i].file_size, kinfo[eng][i].pibuf,
+ kinfo[eng][i].pobuf, kinfo[eng][i].gzip_out_buf,
+ kinfo[eng][i].current_crc, kinfo[eng][i].last_block,
+ e_k_crc[eng][i], e_k_lz[eng][i], e_k_huff[eng][i], eng);
+
+ // Transfer the output (compressed) data from device to host.
+ e_output_dma[eng][i] = q.submit([&](handler &h) {
+ auto out_data = kinfo[eng][i].pobuf->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].poutput_buffer);
+ });
+
+ // Transfer the file size of the compressed output file from device to host.
+ e_size_dma[eng][i] = q.submit([&](handler &h) {
+ auto out_data =
+ kinfo[eng][i].gzip_out_buf->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].out_info);
+ });
+
+ // Transfer the CRC of the compressed output file from device to host.
+ e_crc_dma[eng][i] = q.submit([&](handler &h) {
+ auto out_data =
+ kinfo[eng][i].current_crc->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].buffer_crc);
+ });
+ }
+ }
+
+ // Wait for all kernels to complete
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ for (int i = 0; i < buffers_count; i++) {
+ e_output_dma[eng][i].wait();
+ e_size_dma[eng][i].wait();
+ e_crc_dma[eng][i].wait();
+ }
+ }
+
+// Stop the timer.
+#ifndef FPGA_EMULATOR
+ double diff_total = perf_timer.Elapsed();
+ double gbps = iterations * isz / (double)diff_total / 1000000000.0;
+#endif
+
+ // Check the compressed file size from each iteration. Make sure the size is actually
+ // less-than-or-equal to the input size. Also calculate the remaining CRC.
+ size_t compressed_sz[kNumEngines];
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ compressed_sz[eng] = 0;
+ for (int i = 0; i < buffers_count; i++) {
+ if (kinfo[eng][i].out_info[0].compression_sz > kinfo[eng][i].file_size) {
+ std::cerr << "Unsupported: compressed file larger than input file( "
+ << kinfo[eng][i].out_info[0].compression_sz << " )\n";
+ return 1;
+ }
+ // The majority of the CRC is calculated by the CRC kernel on the FPGA. But the kernel
+ // operates on quantized chunks of input data, so any remaining input data, that falls
+ // outside the quanta, is included in the overall CRC calculation via the following
+ // function that runs on the host. The last argument is the running CRC that was computed
+ // on the FPGA.
+ kinfo[eng][i].buffer_crc[0] =
+ Crc32(kinfo[eng][i].pref_buffer, kinfo[eng][i].file_size,
+ kinfo[eng][i].buffer_crc[0]);
+ // Accumulate the compressed size across all iterations. Used to
+ // compute compression ratio later.
+ compressed_sz[eng] += kinfo[eng][i].out_info[0].compression_sz;
+ }
+ }
+
+ // delete the file mapping now that all kernels are complete, and we've
+ // snapped the time delta
+ if (prepin) {
+ free(pinbuf, q.get_context());
+ } else {
+ delete pinbuf;
+ }
+
+ // Write the output compressed data from the first iteration of each engine, to a file.
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ // WriteBlockGzip() returns 1 on failure
+ if (report && WriteBlockGzip(input_file, outfilenames[eng], kinfo[eng][0].poutput_buffer,
+ kinfo[eng][0].out_info[0].compression_sz,
+ kinfo[eng][0].file_size, kinfo[eng][0].buffer_crc[0])) {
+ std::cout << "FAILED\n";
+ return 1;
+ }
+ }
+
+ // Decompress the output from engine-0 and compare against the input file. Only engine-0's
+ // output is verified since all engines are fed the same input data.
+ if (report && CompareGzipFiles(input_file, outfilenames[0])) {
+ std::cout << "FAILED\n";
+ return 1;
+ }
+
+ // Generate throughput report
+ // First gather all the execution times.
+ size_t time_k_crc[kNumEngines];
+ size_t time_k_lz[kNumEngines];
+ size_t time_k_huff[kNumEngines];
+ size_t time_input_dma[kNumEngines];
+ size_t time_output_dma[kNumEngines];
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ time_k_crc[eng] = 0;
+ time_k_lz[eng] = 0;
+ time_k_huff[eng] = 0;
+ time_input_dma[eng] = 0;
+ time_output_dma[eng] = 0;
+ for (int i = 0; i < buffers_count; i++) {
+ e_k_crc[eng][i].wait();
+ e_k_lz[eng][i].wait();
+ e_k_huff[eng][i].wait();
+ time_k_crc[eng] += SyclGetExecTimeNs(e_k_crc[eng][i]);
+ time_k_lz[eng] += SyclGetExecTimeNs(e_k_lz[eng][i]);
+ time_k_huff[eng] += SyclGetExecTimeNs(e_k_huff[eng][i]);
+ time_input_dma[eng] += SyclGetExecTimeNs(e_input_dma[eng][i]);
+ time_output_dma[eng] += SyclGetExecTimeNs(e_output_dma[eng][i]);
+ }
+ }
+
+ if (report) {
+ double compression_ratio =
+ (double)((double)compressed_sz[0] / (double)isz / iterations);
+#ifndef FPGA_EMULATOR
+ std::cout << "Throughput: " << kNumEngines * gbps << " GB/s\n\n";
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ std::cout << "TP breakdown for engine #" << eng << " (GB/s)\n";
+ std::cout << "CRC = " << iterations * isz / (double)time_k_crc[eng]
+ << "\n";
+ std::cout << "LZ77 = " << iterations * isz / (double)time_k_lz[eng]
+ << "\n";
+ std::cout << "Huffman Encoding = "
+ << iterations * isz / (double)time_k_huff[eng] << "\n";
+ std::cout << "DMA host-to-device = "
+ << iterations * isz / (double)time_input_dma[eng] << "\n";
+ std::cout << "DMA device-to-host = "
+ << iterations * isz / (double)time_output_dma[eng] << "\n\n";
+ }
+#endif
+ std::cout << "Compression Ratio " << compression_ratio * 100 << "%\n";
+ }
+
+ // Cleanup anything that was allocated by this routine.
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ for (int i = 0; i < buffers_count; i++) {
+ if (i < 3) {
+ delete kinfo[eng][i].gzip_out_buf;
+ delete kinfo[eng][i].current_crc;
+ delete kinfo[eng][i].pibuf;
+ delete kinfo[eng][i].pobuf;
+ if (prepin) {
+ free(kinfo[eng][i].poutput_buffer, q.get_context());
+ } else {
+ free(kinfo[eng][i].poutput_buffer);
+ }
+ }
+ free(kinfo[eng][i].pobuf_decompress);
+ }
+ free(kinfo[eng]);
+ }
+
+ if (report) std::cout << "PASSED\n";
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp
new file mode 100755
index 0000000000..01d69c1f9b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp
@@ -0,0 +1,2406 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+/*
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <CL/sycl.hpp>
+
+#include "gzipkernel.hpp"
+#include "kernels.hpp"
+
+using namespace sycl;
+
+// This reference design uses a template-based unroller. It's also possible
+// to specify this in a more concise way using a pragma. See the loop unroll
+// tutorial for more information.
+template <int Begin, int End>
+struct Unroller {
+ template <typename Action>
+ static void step(const Action &action) {
+ action(Begin);
+ Unroller<Begin + 1, End>::step(action);
+ }
+};
+
+template <int End>
+struct Unroller<End, End> {
+ template <typename Action>
+ static void step(const Action &action) {}
+};
+
+int GetHuffLiteralBits(unsigned char ch) {
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ return static_ltree[ch].code;
+}
+
+int GetHuffLiteralLen(unsigned char ch) {
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ return static_ltree[ch].len;
+}
+
+int GetHuffRunLen(int len, int initial_dist) {
+ int lc;
+ unsigned code;
+ int extra;
+ int dist;
+ int local_lbits, local_llen;
+ int local_dbits, local_dlen;
+ local_lbits = 0;
+ local_llen = 0;
+
+ int base_length[kLengthCodes] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24,
+ 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0,
+ };
+
+ int extra_lbits[kLengthCodes] // extra bits for each length code
+ = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
+
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+
+ // distance codes. The first 256 values correspond to the distances
+ // 3 .. 258, the last 256 values correspond to the top 8 bits of
+ // the 15 bit distances.
+ unsigned char dist_code[512] = {
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29,
+ };
+ // length code for each normalized match length (0 == kMinMatch)
+ unsigned char length_code[kMaxMatch - kMinMatch + 1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+ 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 28,
+ };
+
+ int extra_dbits[kDCodes] // extra bits for each distance code
+ = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+
+ int base_dist[kDCodes] = {
+ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24,
+ 32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
+ 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
+ };
+
+ CtData static_dtree[kDCodes] = {
+ {0, 5}, {16, 5}, {8, 5}, {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5},
+ {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5},
+ {1, 5}, {17, 5}, {9, 5}, {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5},
+ {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5},
+ };
+
+ lc = len - kMinMatch;
+ code = length_code[lc];
+
+ local_lbits = static_ltree[code + kLiterals + 1].code;
+ local_llen = static_ltree[code + kLiterals + 1].len;
+ extra = extra_lbits[code];
+ if (extra) {
+ lc -= base_length[code];
+ local_lbits |= lc << local_llen;
+ local_llen += extra;
+ }
+
+ dist = initial_dist;
+ dist--;
+ code = d_code(dist);
+ local_dbits = static_dtree[code].code;
+ local_dlen = static_dtree[code].len;
+ extra = extra_dbits[code];
+ if (extra) {
+ dist -= base_dist[code];
+ local_dbits |= dist << local_dlen;
+ local_dlen += extra;
+ }
+
+ local_lbits |= local_dbits << local_llen;
+ local_llen += local_dlen;
+
+ return local_llen;
+}
+
// Assembles the static-Huffman bit pattern for one length/distance (run)
// symbol pair: the length code plus its extra bits in the low bits, followed
// by the distance code plus its extra bits. Companion of GetHuffRunLen,
// which performs the same computation but returns the bit count instead.
//
// len          - match length in bytes; encoded relative to kMinMatch
// initial_dist - match distance in bytes (>= 1); encoded as dist - 1
//
// Returns the assembled code bits, LSB-first.
//
// NOTE(review): the constant tables below are the fixed (static) deflate
// trees/tables from RFC 1951. They are redeclared locally inside each
// helper rather than shared at file scope — presumably so each helper gets
// its own ROM in hardware synthesis; confirm before hoisting them out.
int GetHuffRunBits(int len, int initial_dist) {
  int lc;
  unsigned code;
  int extra;
  int dist;
  int local_lbits, local_llen;
  int local_dbits, local_dlen;
  local_lbits = 0;
  local_llen = 0;

  // First normalized match length of each length code (RFC 1951 sec. 3.2.5).
  int base_length[kLengthCodes] = {
      0,  1,  2,  3,  4,  5,  6,  7,  8,  10, 12, 14, 16,  20,  24,
      28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0,
  };

  int extra_lbits[kLengthCodes]  // extra bits for each length code
      = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
         2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};

  // Static literal/length tree: {code bits, code length} per symbol
  // (RFC 1951 fixed Huffman codes; bits stored LSB-first).
  CtData static_ltree[kLCodes + 2] = {
      {12, 8},  {140, 8}, {76, 8},  {204, 8}, {44, 8},  {172, 8}, {108, 8},
      {236, 8}, {28, 8},  {156, 8}, {92, 8},  {220, 8}, {60, 8},  {188, 8},
      {124, 8}, {252, 8}, {2, 8},   {130, 8}, {66, 8},  {194, 8}, {34, 8},
      {162, 8}, {98, 8},  {226, 8}, {18, 8},  {146, 8}, {82, 8},  {210, 8},
      {50, 8},  {178, 8}, {114, 8}, {242, 8}, {10, 8},  {138, 8}, {74, 8},
      {202, 8}, {42, 8},  {170, 8}, {106, 8}, {234, 8}, {26, 8},  {154, 8},
      {90, 8},  {218, 8}, {58, 8},  {186, 8}, {122, 8}, {250, 8}, {6, 8},
      {134, 8}, {70, 8},  {198, 8}, {38, 8},  {166, 8}, {102, 8}, {230, 8},
      {22, 8},  {150, 8}, {86, 8},  {214, 8}, {54, 8},  {182, 8}, {118, 8},
      {246, 8}, {14, 8},  {142, 8}, {78, 8},  {206, 8}, {46, 8},  {174, 8},
      {110, 8}, {238, 8}, {30, 8},  {158, 8}, {94, 8},  {222, 8}, {62, 8},
      {190, 8}, {126, 8}, {254, 8}, {1, 8},   {129, 8}, {65, 8},  {193, 8},
      {33, 8},  {161, 8}, {97, 8},  {225, 8}, {17, 8},  {145, 8}, {81, 8},
      {209, 8}, {49, 8},  {177, 8}, {113, 8}, {241, 8}, {9, 8},   {137, 8},
      {73, 8},  {201, 8}, {41, 8},  {169, 8}, {105, 8}, {233, 8}, {25, 8},
      {153, 8}, {89, 8},  {217, 8}, {57, 8},  {185, 8}, {121, 8}, {249, 8},
      {5, 8},   {133, 8}, {69, 8},  {197, 8}, {37, 8},  {165, 8}, {101, 8},
      {229, 8}, {21, 8},  {149, 8}, {85, 8},  {213, 8}, {53, 8},  {181, 8},
      {117, 8}, {245, 8}, {13, 8},  {141, 8}, {77, 8},  {205, 8}, {45, 8},
      {173, 8}, {109, 8}, {237, 8}, {29, 8},  {157, 8}, {93, 8},  {221, 8},
      {61, 8},  {189, 8}, {125, 8}, {253, 8}, {19, 9},  {275, 9}, {147, 9},
      {403, 9}, {83, 9},  {339, 9}, {211, 9}, {467, 9}, {51, 9},  {307, 9},
      {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
      {267, 9}, {139, 9}, {395, 9}, {75, 9},  {331, 9}, {203, 9}, {459, 9},
      {43, 9},  {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
      {491, 9}, {27, 9},  {283, 9}, {155, 9}, {411, 9}, {91, 9},  {347, 9},
      {219, 9}, {475, 9}, {59, 9},  {315, 9}, {187, 9}, {443, 9}, {123, 9},
      {379, 9}, {251, 9}, {507, 9}, {7, 9},   {263, 9}, {135, 9}, {391, 9},
      {71, 9},  {327, 9}, {199, 9}, {455, 9}, {39, 9},  {295, 9}, {167, 9},
      {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9},  {279, 9},
      {151, 9}, {407, 9}, {87, 9},  {343, 9}, {215, 9}, {471, 9}, {55, 9},
      {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
      {15, 9},  {271, 9}, {143, 9}, {399, 9}, {79, 9},  {335, 9}, {207, 9},
      {463, 9}, {47, 9},  {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
      {239, 9}, {495, 9}, {31, 9},  {287, 9}, {159, 9}, {415, 9}, {95, 9},
      {351, 9}, {223, 9}, {479, 9}, {63, 9},  {319, 9}, {191, 9}, {447, 9},
      {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7},   {64, 7},  {32, 7},
      {96, 7},  {16, 7},  {80, 7},  {48, 7},  {112, 7}, {8, 7},   {72, 7},
      {40, 7},  {104, 7}, {24, 7},  {88, 7},  {56, 7},  {120, 7}, {4, 7},
      {68, 7},  {36, 7},  {100, 7}, {20, 7},  {84, 7},  {52, 7},  {116, 7},
      {3, 8},   {131, 8}, {67, 8},  {195, 8}, {35, 8},  {163, 8}, {99, 8},
      {227, 8},
  };

  // distance codes. The first 256 values correspond to the distances
  // 3 .. 258, the last 256 values correspond to the top 8 bits of
  // the 15 bit distances.
  // (Consumed by the d_code() lookup below -- presumably it indexes the
  // first half for small distances and the second half for large ones;
  // confirm against the d_code definition, which is outside this view.)
  unsigned char dist_code[512] = {
      0,  1,  2,  3,  4,  4,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,
      8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  10, 10, 10, 10,
      10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
      11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12,
      12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
      12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
      13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
      13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
      14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
      15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
      15, 15, 15, 15, 0,  0,  16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21,
      21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
      25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
      26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
      26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
      27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
      27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
      28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
      28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
      28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
      29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
      29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
      29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
      29, 29, 29, 29, 29, 29, 29, 29,
  };
  // length code for each normalized match length (0 == kMinMatch)
  unsigned char length_code[kMaxMatch - kMinMatch + 1] = {
      0,  1,  2,  3,  4,  5,  6,  7,  8,  8,  9,  9,  10, 10, 11, 11, 12, 12,
      12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16,
      16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
      18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
      20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
      21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
      22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
      23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
      24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
      25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
      25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
      26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
      26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
      27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
      27, 27, 27, 28,
  };

  int extra_dbits[kDCodes]  // extra bits for each distance code
      = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
         6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};

  // Smallest (dist - 1) value covered by each distance code.
  int base_dist[kDCodes] = {
      0,    1,    2,    3,    4,    6,    8,    12,   16,    24,
      32,   48,   64,   96,   128,  192,  256,  384,  512,   768,
      1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
  };

  // Static distance tree: all 30 codes are 5 bits (RFC 1951).
  CtData static_dtree[kDCodes] = {
      {0, 5}, {16, 5}, {8, 5},  {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5},
      {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5},
      {1, 5}, {17, 5}, {9, 5},  {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5},
      {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5},
  };

  // --- length part: tree code, then the extra (raw) length bits above it ---
  lc = len - kMinMatch;
  code = length_code[lc];

  local_lbits = static_ltree[code + kLiterals + 1].code;
  local_llen = static_ltree[code + kLiterals + 1].len;
  extra = extra_lbits[code];
  if (extra) {
    lc -= base_length[code];
    local_lbits |= lc << local_llen;  // append residual length LSB-first
    local_llen += extra;
  }

  // --- distance part: encoded relative to dist - 1 ---
  dist = initial_dist;
  dist--;
  code = d_code(dist);
  local_dbits = static_dtree[code].code;
  local_dlen = static_dtree[code].len;
  extra = extra_dbits[code];
  if (extra) {
    dist -= base_dist[code];
    local_dbits |= dist << local_dlen;  // append residual distance bits
    local_dlen += extra;
  }

  // Concatenate: distance bits follow the length bits.
  local_lbits |= local_dbits << local_llen;
  local_llen += local_dlen;

  // Unlike GetHuffRunLen, return the assembled bits (not the bit count).
  return local_lbits;
}
+
// Returns the number of bits GetHuffBits produces for one symbol slot.
// The len argument doubles as a command selector:
//   -3 -> end-of-block marker: length of the EOB code in the static tree
//   -2 -> 3 bits (pairs with GetHuffBits case -2, which emits `ch` raw;
//         presumably the deflate block header -- confirm against the caller)
//   -1 -> invalid/empty slot, contributes 0 bits (see IsValid)
//    0 -> literal byte `ch`
//   >0 -> length/distance (run) pair encoded from (len, dist)
int GetHuffLen(int len, int dist, unsigned char ch) {
  int returned_len;

  // Static literal/length tree (RFC 1951 fixed codes). Only the kEndBlock
  // entry is read in this function; the full table is declared locally in
  // the same way as in the sibling helpers.
  CtData static_ltree[kLCodes + 2] = {
      {12, 8},  {140, 8}, {76, 8},  {204, 8}, {44, 8},  {172, 8}, {108, 8},
      {236, 8}, {28, 8},  {156, 8}, {92, 8},  {220, 8}, {60, 8},  {188, 8},
      {124, 8}, {252, 8}, {2, 8},   {130, 8}, {66, 8},  {194, 8}, {34, 8},
      {162, 8}, {98, 8},  {226, 8}, {18, 8},  {146, 8}, {82, 8},  {210, 8},
      {50, 8},  {178, 8}, {114, 8}, {242, 8}, {10, 8},  {138, 8}, {74, 8},
      {202, 8}, {42, 8},  {170, 8}, {106, 8}, {234, 8}, {26, 8},  {154, 8},
      {90, 8},  {218, 8}, {58, 8},  {186, 8}, {122, 8}, {250, 8}, {6, 8},
      {134, 8}, {70, 8},  {198, 8}, {38, 8},  {166, 8}, {102, 8}, {230, 8},
      {22, 8},  {150, 8}, {86, 8},  {214, 8}, {54, 8},  {182, 8}, {118, 8},
      {246, 8}, {14, 8},  {142, 8}, {78, 8},  {206, 8}, {46, 8},  {174, 8},
      {110, 8}, {238, 8}, {30, 8},  {158, 8}, {94, 8},  {222, 8}, {62, 8},
      {190, 8}, {126, 8}, {254, 8}, {1, 8},   {129, 8}, {65, 8},  {193, 8},
      {33, 8},  {161, 8}, {97, 8},  {225, 8}, {17, 8},  {145, 8}, {81, 8},
      {209, 8}, {49, 8},  {177, 8}, {113, 8}, {241, 8}, {9, 8},   {137, 8},
      {73, 8},  {201, 8}, {41, 8},  {169, 8}, {105, 8}, {233, 8}, {25, 8},
      {153, 8}, {89, 8},  {217, 8}, {57, 8},  {185, 8}, {121, 8}, {249, 8},
      {5, 8},   {133, 8}, {69, 8},  {197, 8}, {37, 8},  {165, 8}, {101, 8},
      {229, 8}, {21, 8},  {149, 8}, {85, 8},  {213, 8}, {53, 8},  {181, 8},
      {117, 8}, {245, 8}, {13, 8},  {141, 8}, {77, 8},  {205, 8}, {45, 8},
      {173, 8}, {109, 8}, {237, 8}, {29, 8},  {157, 8}, {93, 8},  {221, 8},
      {61, 8},  {189, 8}, {125, 8}, {253, 8}, {19, 9},  {275, 9}, {147, 9},
      {403, 9}, {83, 9},  {339, 9}, {211, 9}, {467, 9}, {51, 9},  {307, 9},
      {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
      {267, 9}, {139, 9}, {395, 9}, {75, 9},  {331, 9}, {203, 9}, {459, 9},
      {43, 9},  {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
      {491, 9}, {27, 9},  {283, 9}, {155, 9}, {411, 9}, {91, 9},  {347, 9},
      {219, 9}, {475, 9}, {59, 9},  {315, 9}, {187, 9}, {443, 9}, {123, 9},
      {379, 9}, {251, 9}, {507, 9}, {7, 9},   {263, 9}, {135, 9}, {391, 9},
      {71, 9},  {327, 9}, {199, 9}, {455, 9}, {39, 9},  {295, 9}, {167, 9},
      {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9},  {279, 9},
      {151, 9}, {407, 9}, {87, 9},  {343, 9}, {215, 9}, {471, 9}, {55, 9},
      {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
      {15, 9},  {271, 9}, {143, 9}, {399, 9}, {79, 9},  {335, 9}, {207, 9},
      {463, 9}, {47, 9},  {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
      {239, 9}, {495, 9}, {31, 9},  {287, 9}, {159, 9}, {415, 9}, {95, 9},
      {351, 9}, {223, 9}, {479, 9}, {63, 9},  {319, 9}, {191, 9}, {447, 9},
      {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7},   {64, 7},  {32, 7},
      {96, 7},  {16, 7},  {80, 7},  {48, 7},  {112, 7}, {8, 7},   {72, 7},
      {40, 7},  {104, 7}, {24, 7},  {88, 7},  {56, 7},  {120, 7}, {4, 7},
      {68, 7},  {36, 7},  {100, 7}, {20, 7},  {84, 7},  {52, 7},  {116, 7},
      {3, 8},   {131, 8}, {67, 8},  {195, 8}, {35, 8},  {163, 8}, {99, 8},
      {227, 8},
  };
  switch (len) {
    case -3:
      // End-of-block symbol.
      returned_len = static_ltree[kEndBlock].len;
      break;
    case -2:
      // Fixed 3-bit field (GetHuffBits emits ch verbatim for this case).
      returned_len = 3;
      break;
    case -1:
      // Empty slot: nothing is emitted.
      returned_len = 0;
      break;
    case 0:
      // Plain literal byte.
      returned_len = GetHuffLiteralLen(ch);
      break;
    default:
      // Length/distance pair.
      returned_len = GetHuffRunLen(len, dist);
      break;
  }
  return returned_len;
}
+
// Reports whether a symbol slot carries data to encode. Only the sentinel
// len == -1 marks an empty slot; every other value (end-of-block, raw
// header, literal, or run) is valid. The dist and ch parameters exist for
// signature parity with GetHuffLen/GetHuffBits and do not affect the result.
int IsValid(int len, int dist, unsigned char ch) {
  return (len == -1) ? 0 : 1;
}
+
// Returns the Huffman-encoded bits for one symbol slot; GetHuffLen returns
// the matching bit count. The len argument doubles as a command selector:
//   -3 -> end-of-block code from the static literal/length tree
//   -2 -> `ch` emitted verbatim (3 bits per GetHuffLen; presumably the
//         deflate block header -- confirm against the caller)
//   -1 -> invalid/empty slot, emits 0
//    0 -> literal byte `ch`
//   >0 -> length/distance (run) pair encoded from (len, dist)
int GetHuffBits(int len, int dist, unsigned char ch) {
  int bits;
  // Static literal/length tree (RFC 1951 fixed codes). Only the kEndBlock
  // entry is read in this function; the full table is declared locally in
  // the same way as in the sibling helpers.
  CtData static_ltree[kLCodes + 2] = {
      {12, 8},  {140, 8}, {76, 8},  {204, 8}, {44, 8},  {172, 8}, {108, 8},
      {236, 8}, {28, 8},  {156, 8}, {92, 8},  {220, 8}, {60, 8},  {188, 8},
      {124, 8}, {252, 8}, {2, 8},   {130, 8}, {66, 8},  {194, 8}, {34, 8},
      {162, 8}, {98, 8},  {226, 8}, {18, 8},  {146, 8}, {82, 8},  {210, 8},
      {50, 8},  {178, 8}, {114, 8}, {242, 8}, {10, 8},  {138, 8}, {74, 8},
      {202, 8}, {42, 8},  {170, 8}, {106, 8}, {234, 8}, {26, 8},  {154, 8},
      {90, 8},  {218, 8}, {58, 8},  {186, 8}, {122, 8}, {250, 8}, {6, 8},
      {134, 8}, {70, 8},  {198, 8}, {38, 8},  {166, 8}, {102, 8}, {230, 8},
      {22, 8},  {150, 8}, {86, 8},  {214, 8}, {54, 8},  {182, 8}, {118, 8},
      {246, 8}, {14, 8},  {142, 8}, {78, 8},  {206, 8}, {46, 8},  {174, 8},
      {110, 8}, {238, 8}, {30, 8},  {158, 8}, {94, 8},  {222, 8}, {62, 8},
      {190, 8}, {126, 8}, {254, 8}, {1, 8},   {129, 8}, {65, 8},  {193, 8},
      {33, 8},  {161, 8}, {97, 8},  {225, 8}, {17, 8},  {145, 8}, {81, 8},
      {209, 8}, {49, 8},  {177, 8}, {113, 8}, {241, 8}, {9, 8},   {137, 8},
      {73, 8},  {201, 8}, {41, 8},  {169, 8}, {105, 8}, {233, 8}, {25, 8},
      {153, 8}, {89, 8},  {217, 8}, {57, 8},  {185, 8}, {121, 8}, {249, 8},
      {5, 8},   {133, 8}, {69, 8},  {197, 8}, {37, 8},  {165, 8}, {101, 8},
      {229, 8}, {21, 8},  {149, 8}, {85, 8},  {213, 8}, {53, 8},  {181, 8},
      {117, 8}, {245, 8}, {13, 8},  {141, 8}, {77, 8},  {205, 8}, {45, 8},
      {173, 8}, {109, 8}, {237, 8}, {29, 8},  {157, 8}, {93, 8},  {221, 8},
      {61, 8},  {189, 8}, {125, 8}, {253, 8}, {19, 9},  {275, 9}, {147, 9},
      {403, 9}, {83, 9},  {339, 9}, {211, 9}, {467, 9}, {51, 9},  {307, 9},
      {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
      {267, 9}, {139, 9}, {395, 9}, {75, 9},  {331, 9}, {203, 9}, {459, 9},
      {43, 9},  {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
      {491, 9}, {27, 9},  {283, 9}, {155, 9}, {411, 9}, {91, 9},  {347, 9},
      {219, 9}, {475, 9}, {59, 9},  {315, 9}, {187, 9}, {443, 9}, {123, 9},
      {379, 9}, {251, 9}, {507, 9}, {7, 9},   {263, 9}, {135, 9}, {391, 9},
      {71, 9},  {327, 9}, {199, 9}, {455, 9}, {39, 9},  {295, 9}, {167, 9},
      {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9},  {279, 9},
      {151, 9}, {407, 9}, {87, 9},  {343, 9}, {215, 9}, {471, 9}, {55, 9},
      {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
      {15, 9},  {271, 9}, {143, 9}, {399, 9}, {79, 9},  {335, 9}, {207, 9},
      {463, 9}, {47, 9},  {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
      {239, 9}, {495, 9}, {31, 9},  {287, 9}, {159, 9}, {415, 9}, {95, 9},
      {351, 9}, {223, 9}, {479, 9}, {63, 9},  {319, 9}, {191, 9}, {447, 9},
      {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7},   {64, 7},  {32, 7},
      {96, 7},  {16, 7},  {80, 7},  {48, 7},  {112, 7}, {8, 7},   {72, 7},
      {40, 7},  {104, 7}, {24, 7},  {88, 7},  {56, 7},  {120, 7}, {4, 7},
      {68, 7},  {36, 7},  {100, 7}, {20, 7},  {84, 7},  {52, 7},  {116, 7},
      {3, 8},   {131, 8}, {67, 8},  {195, 8}, {35, 8},  {163, 8}, {99, 8},
      {227, 8},
  };
  switch (len) {
    case -3:
      // End-of-block symbol.
      bits = static_ltree[kEndBlock].code;
      break;
    case -2:
      // Raw passthrough of ch (3 bits, per GetHuffLen).
      bits = ch;
      break;
    case -1:
      // Empty slot: nothing is emitted.
      bits = 0;
      break;
    case 0:
      // Plain literal byte.
      bits = GetHuffLiteralBits(ch);
      break;
    default:
      // Length/distance pair.
      bits = GetHuffRunBits(len, dist);
      break;
  }
  return bits;
}
+
// assembles up to kVecX2 unsigned char values based on given huffman encoding
// writes up to kMaxHuffcodeBits * kVecX2 bits to memory
//
// len/dist/data : kVec parallel symbol lanes (see GetHuffLen's len encoding)
// outdata       : kVec 32-bit words assembled this call
// leftover      : carry buffer of bits not yet flushed (kVec words)
// leftover_size : bit count held in leftover; updated in place
//
// Returns true when outdata holds a full kVec words and should be written
// out by the caller, false when everything was absorbed into leftover.
bool HufEnc(char *len, short *dist, unsigned char *data, unsigned int *outdata,
            unsigned int *leftover, unsigned short *leftover_size) {
  // array that contains the bit position of each symbol
  // bitpos[i] = total bits of lanes 0..i-1; invalid lanes contribute 0.
  unsigned short bitpos[kVec + 1];
  bitpos[0] = 0;

  Unroller<0, kVec>::step([&](int i) {
    bitpos[i + 1] = bitpos[i] + (IsValid(len[i], dist[i], data[i])
                                     ? GetHuffLen(len[i], dist[i], data[i])
                                     : 0);
  });

  // leftover is an array that carries huffman encoded data not yet written to
  // memory adjust leftover_size with the number of bits to write this time
  unsigned short prev_cycle_offset = *leftover_size;
  *leftover_size += (bitpos[kVec] & 0x3fff);

  // we'll write this cycle if we have collected enough data (kVec shorts or
  // more)
  // NOTE(review): this test-and-clear treats kVec * kMaxHuffcodeBits * 2 as
  // a single-bit threshold mask, which assumes the product is a power of
  // two -- confirm the kVec/kMaxHuffcodeBits definitions.
  bool write = *leftover_size & (kVec * (kMaxHuffcodeBits * 2));

  // subtract kVec shorts from leftover size (if it's bigger
  // than kVec) because we'll write those out this cycle
  *leftover_size &= ~(kVec * (kMaxHuffcodeBits * 2));

  // Adjust bitpos based on leftover offset from previous cycle
  Unroller<0, kVec>::step(
      [&](int i) { bitpos[i] += (prev_cycle_offset & 0x3fff); });

  // Huffman codes have any bit alignement, so they can spill
  // onto two shorts in the output array
  // use ushort2 to keep each part of the code separate
  // Iterate over all codes and construct ushort2 containing
  // the code properly aligned
  struct Uint2Gzip code[kVec];
  Unroller<0, kVec>::step([&](int i) {
    code[i].x = 0;
    code[i].y = 0;
  });

  Unroller<0, kVec>::step([&](int i) {
    // Codes can be more than 16 bits, so use uint32
    unsigned int curr_code = GetHuffBits(len[i], dist[i], data[i]);
    // Bit offset of this lane's code within its 32-bit destination word.
    unsigned char bitpos_in_short = bitpos[i] & 0x01F;

    // Shift through a 64-bit temp so bits spilling past the word boundary
    // land in the .y half.
    unsigned long long temp = (unsigned long long)curr_code << bitpos_in_short;
    unsigned int temp1 = (unsigned int)temp;
    unsigned int temp2 = temp >> 32ULL;

    // NOTE(review): both branches are identical. Harmless as written --
    // invalid lanes have len == -1, for which GetHuffBits returns 0, so
    // temp1/temp2 are already 0 -- but the guard looks vestigial; confirm
    // whether the else-branch was meant to assign zeros.
    if (IsValid(len[i], dist[i], data[i])) {
      code[i].x = temp1;
      code[i].y = temp2;
    } else {
      code[i].x = temp1;
      code[i].y = temp2;
    }
  });

  // Iterate over all destination locations and gather the required data
  unsigned int new_leftover[kVec];
  Unroller<0, kVec>::step([&](int i) {
    new_leftover[i] = 0;
    outdata[i] = 0;

    Unroller<0, kVec>::step([&](int j) {
      // figure out whether code[j] goes into bucket[i]
      bool match_first = ((bitpos[j] >> 5) & (kVec - 1)) == i;
      bool match_second =
          ((bitpos[j] >> 5) & (kVec - 1)) == ((i - 1) & (kVec - 1));

      // if code[j] maps onto current bucket then OR its code, else OR with 0
      unsigned int component =
          match_first ? code[j].x : (match_second ? code[j].y : 0);

      // overflow from kVec shorts, need to move onto new_leftover
      bool use_later =
          (bitpos[j] & (kVec * (kMaxHuffcodeBits * 2))) ||
          (match_second && (((bitpos[j] >> 5) & (kVec - 1)) == kVec - 1));

      // write to output
      outdata[i] |= use_later ? 0 : component;
      new_leftover[i] |= use_later ? component : 0;
    });
  });

  // Apply previous leftover on the outdata
  // Also, if didn't write, apply prev leftover onto newleftover
  Unroller<0, kVec>::step([&](int i) {
    outdata[i] |= leftover[i];
    leftover[i] = outdata[i];
  });

  // Split unroll into two unrolls to avoid compiler crash. This is a temporary
  // workaround while awaiting a compiler feature.
  if (write) {
    Unroller<0, kVec>::step([&](int i) { leftover[i] = new_leftover[i]; });
  }

  return write;
}
+
+template
+class CRC;
+template
+class LZReduction;
+template