diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt
new file mode 100755
index 0000000000..6ae6386d49
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(CRR)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md
new file mode 100755
index 0000000000..ab98bae8d7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md
@@ -0,0 +1,224 @@
+# CRR Binomial Tree Model for Option Pricing
+An FPGA-optimized reference design computing the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options.
+
+The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. Additional reference material specific to option pricing algorithms is provided in the References section of this README.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; <br> Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta) <br> Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | Review a high performance DPC++ design optimized for FPGA
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+
+**Performance**
+Please refer to the performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 118 assets/s
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 243 assets/s
+
+
+## Purpose
+This sample implements the Cox-Ross-Rubinstein (CRR) binomial tree model, which is used in finance to price American exercise options, together with five Greeks (delta, gamma, theta, vega, and rho). The basic idea is to model all possible asset price paths using a binomial tree.
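+
+For reference, below is a minimal CPU-only sketch of CRR pricing for an American option. The function and parameter names are illustrative, not taken from this design; the FPGA kernel in `src/main.cpp` restructures this backward recursion for parallel hardware execution.
+
+```
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+// Illustrative CPU sketch of CRR American option pricing.
+// cp = -1 for a put, +1 for a call; u is the per-step up factor,
+// p the risk-neutral probability of an up move, and disc the
+// one-step discount factor.
+double CrrPrice(int cp, double spot, double strike, double disc,
+                double u, double p, int n_steps) {
+  std::vector<double> value(n_steps + 1);
+  // Option payoff at each terminal node of the tree.
+  for (int i = 0; i <= n_steps; ++i) {
+    double s = spot * std::pow(u, 2 * i - n_steps);
+    value[i] = std::max(cp * (s - strike), 0.0);
+  }
+  // Backward recursion: at every node keep the larger of the discounted
+  // continuation value and the early-exercise payoff (American feature).
+  for (int t = n_steps - 1; t >= 0; --t) {
+    for (int i = 0; i <= t; ++i) {
+      double s = spot * std::pow(u, 2 * i - t);
+      double cont = disc * (p * value[i + 1] + (1.0 - p) * value[i]);
+      value[i] = std::max(cont, cp * (s - strike));
+    }
+  }
+  return value[0];
+}
+```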
+
+## Key Implementation Details
+
+### Design Inputs
+This design reads inputs from the `ordered_inputs.csv` file. The inputs are:
+
+| Input | Description
+--- |---
+| `n_steps` | Number of time steps in the binomial tree. The maximum `n_steps` in this design is 8189.
+| `cp` | -1 or 1 represents put and call options, respectively.
+| `spot` | Spot price of the underlying asset.
+| `fwd` | Forward price of the underlying asset.
+| `strike` | Exercise price of the option.
+| `vol` | Percent volatility that the design reads as a decimal value.
+| `df` | Discount factor to option expiry.
+| `t` | Time, in years, to the maturity of the option.
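+
+As a concrete example, the first row of `src/data/ordered_inputs.csv` is `8189,-1,37.5,37.50112053,85,0.4,0.99997012,0.011952191`. Read in the column order above, it corresponds to the following (illustrative) initialization of the `InputData` structure defined in `src/CRR_common.hpp`:
+
+```
+// Field order follows the InputData struct, not the CSV column order.
+InputData example = {
+    /* cp      */ -1,            // put option
+    /* n_steps */ 8189,          // number of time steps in the binomial tree
+    /* strike  */ 85,
+    /* spot    */ 37.5,
+    /* fwd     */ 37.50112053,
+    /* vol     */ 0.4,           // 40% volatility, given as a decimal
+    /* df      */ 0.99997012,    // discount factor to option expiry
+    /* t       */ 0.011952191};  // time to maturity, in years
+```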
+
+### Design Outputs
+This design writes outputs to the `ordered_outputs.csv` file. The outputs are:
+
+| Output | Description
+--- |---
+| `value` | Option price
+| `delta` | Measures the rate of change of the theoretical option value with respect to changes in the underlying asset's price.
+| `gamma` | Measures the rate of change in the `delta` with respect to changes in the underlying price.
+| `vega` | Measures sensitivity to volatility.
+| `theta` | Measures the sensitivity of the value of the derivative to the passage of time.
+| `rho` | Measures sensitivity to the interest rate.
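+
+The Greeks are not computed from closed-form expressions; they are estimated with finite differences on the binomial tree. A rough sketch of the approach (notation here is illustrative; the exact expressions used are in `ComputeOutput()` in `src/main.cpp`):
+
+```
+\Delta \approx \frac{V_{u} - V_{d}}{S\,(u^{2} - u^{-2})}, \qquad
+\rho \approx \frac{V(r + \epsilon) - V(r)}{\epsilon}, \qquad
+\text{vega} \approx \frac{V(\sigma + \epsilon) - V(\sigma)}{\epsilon}
+```
+
+where `V_u` and `V_d` are option values at underlying prices `S*u^2` and `S*u^-2` (obtained from a tree extended by two extra time steps), `S` is the spot price, `u` the per-step up factor, and `epsilon` a small bump (`kEpsilon` in `src/CRR_common.hpp`) applied to the discount rate or the volatility. `gamma` and `theta` follow from a second difference in the underlying and a centered difference in time, which is why the design evaluates three related trees per asset.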
+
+### Design Correctness
+This design tests the correctness of the optimized FPGA code by comparing its output to a golden result computed on the CPU.
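+
+A minimal sketch of the kind of check performed (the actual comparison, including the `0.00001` tolerance, is in `TestCorrectness()` in `src/main.cpp`):
+
+```
+#include <cmath>
+
+// Accept an FPGA result if it matches the CPU reference to within an
+// absolute tolerance; the design's threshold guarantees that at least
+// 4 decimal places agree.
+bool Matches(double cpu_val, double fpga_val, double threshold = 0.00001) {
+  return std::abs(cpu_val - fpga_val) <= threshold;
+}
+```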
+
+### Design Performance
+This design measures the FPGA performance to determine how many assets can be processed per second.
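+
+Concretely, the reported figure is (see `TestThroughput()` in `src/main.cpp`):
+
+```
+\text{throughput (assets/s)} = \frac{n_{\text{assets}}}{t_{\text{kernel}}}
+```
+
+where `t_kernel` is the wall-clock time of the timed `CrrSolver()` run.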
+
+## License
+This code sample is licensed under the MIT license.
+
+## Building the CRR Program
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (FPGA) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 48h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./crr.fpga_emu <input_file> [-o=<output_file>] (Linux)
+
+ crr.fpga_emu.exe <input_file> [-o=<output_file>] (Windows)
+ ```
+ 2. Run the sample on the FPGA device:
+ ```
+ ./crr.fpga <input_file> [-o=<output_file>] (Linux)
+ ```
+
+### Application Parameters
+
+| Argument | Description
+--- |---
+| `<input_file>` | Optional argument that provides the input data. The default file is `src/data/ordered_inputs.csv`.
+| `-o=<output_file>` | Optional argument that specifies the name of the output file. The default name of the output file is `ordered_outputs.csv`.
+
+### Example of Output
+```
+============ Correctness Test =============
+Running analytical correctness checks...
+CPU-FPGA Equivalence: PASS
+
+============ Throughput Test =============
+Avg throughput: 66.2 assets/s
+```
+
+## Additional Design Information
+
+### Source Code Explanation
+
+| File | Description
+--- |---
+| `main.cpp` | Contains both host code and SYCL* kernel code.
+| `CRR_common.hpp` | Header file for `main.cpp`. Contains the data structures needed for both host code and SYCL* kernel code.
+
+
+
+### Backend Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsdaz` | Denormals are zero
+`-Xsrounding=faithful` | Rounds results to either the upper or lower nearest single-precision numbers
+`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus
+`-Xsseed=2` | Uses seed 2 for the Quartus compile, which yields a slightly higher fMAX
+
+### Preprocessor Define Flags
+
+| Flag | Description
+--- |---
+`-DOUTER_UNROLL=1` | Sets the constant `OUTER_UNROLL` to 1, which controls the number of CRRs that can be processed in parallel
+`-DINNER_UNROLL=64` | Sets the constant `INNER_UNROLL` to 64, which controls the degree of parallelization within the calculation of one CRR
+`-DOUTER_UNROLL_POW2=1` | Sets the constant `OUTER_UNROLL_POW2` to 1, which controls the number of memory banks
+
+
+NOTE: The `-Xsseed`, `-DOUTER_UNROLL`, `-DINNER_UNROLL` and `-DOUTER_UNROLL_POW2` values differ depending on the board being targeted. More information about the unroll factors can be found in `/src/CRR_common.hpp`.
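+
+As a rough, plain-C++ illustration of what these constants control (this is a simplified sketch, not code from the design; the real kernel in `src/main.cpp` fully unrolls the two inner loops and also uses these constants in its FPGA memory-banking attributes):
+
+```
+#include <algorithm>
+
+#ifndef OUTER_UNROLL
+#define OUTER_UNROLL 1   // number of CRR problems processed side by side
+#endif
+#ifndef INNER_UNROLL
+#define INNER_UNROLL 64  // adjacent tree nodes updated together per problem
+#endif
+
+// One level of the backward recursion: the outer dimension walks independent
+// CRR problems, the inner dimension walks adjacent nodes of one tree level.
+// On the FPGA, both inner loops are unrolled, so roughly
+// OUTER_UNROLL * INNER_UNROLL node updates happen per clock cycle.
+// (The per-node early-exercise payoff is simplified to a constant here, and
+// optval is assumed to have at least nodes + 1 rows.)
+void UpdateTreeLevel(double optval[][OUTER_UNROLL], int nodes, double c1,
+                     double c2, double exercise) {
+  for (int n = 0; n + INNER_UNROLL <= nodes; n += INNER_UNROLL) {
+    for (int ic = 0; ic < OUTER_UNROLL; ++ic) {
+      for (int ri = 0; ri < INNER_UNROLL; ++ri) {
+        optval[n + ri][ic] = std::max(
+            c1 * optval[n + ri][ic] + c2 * optval[n + ri + 1][ic], exercise);
+      }
+    }
+  }
+}
+```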
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](https://www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 20, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](https://www.intel.com).
+
+The performance was measured by Intel on July 20, 2020.
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+### References
+
+[Khronos SYCL Resources](https://www.khronos.org/sycl/resources)
+
+[Binomial options pricing model](https://en.wikipedia.org/wiki/Binomial_options_pricing_model)
+
+[Wikipedia page for Greeks (finance)](https://en.wikipedia.org/wiki/Greeks_(finance))
+
+[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln
new file mode 100755
index 0000000000..a95fce9c30
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crr", "crr.vcxproj", "{8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.ActiveCfg = Debug|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.Build.0 = Debug|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.ActiveCfg = Release|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {6887ACDD-3E54-4396-A921-99C630333932}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj
new file mode 100755
index 0000000000..62a523e96c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj
@@ -0,0 +1,165 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {8eb512ff-4487-4fec-9b88-8c0da734b1b2}
+ Win32Proj
+ crr
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions)
+ false
+ $(IntDir)crr.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions)
+ $(IntDir)crr.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user
new file mode 100755
index 0000000000..9115b3f275
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user
@@ -0,0 +1,14 @@
+
+
+
+ false
+
+
+ ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv
+ WindowsLocalDebugger
+
+
+ ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json
new file mode 100755
index 0000000000..6155ce223d
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "D725E06E-0ECE-44F8-910D-AD1A8C89ED89",
+ "name": "CRR Binomial Tree Model for Option Pricing",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "FPGA-optimized reference design of the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./crr.fpga_emu ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "crr.fpga_emu.exe ./data/ordered_inputs.csv -o=./data/ordered_outputs.csv"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt
new file mode 100755
index 0000000000..8c56a699ad
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt
@@ -0,0 +1,116 @@
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME crr)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+set(OUTER_UNROLL_A10 1)
+set(INNER_UNROLL_A10 64)
+set(OUTER_UNROLL_POW2_A10 1)
+set(OUTER_UNROLL_S10 2)
+set(INNER_UNROLL_S10 64)
+set(OUTER_UNROLL_POW2_S10 2)
+set(SEED_A10 1)
+set(SEED_S10 2)
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(OUTER_UNROLL ${OUTER_UNROLL_A10})
+SET(INNER_UNROLL ${INNER_UNROLL_A10})
+SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_A10})
+SET(SEED ${SEED_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(OUTER_UNROLL ${OUTER_UNROLL_S10})
+ SET(INNER_UNROLL ${INNER_UNROLL_S10})
+ SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_S10})
+ SET(SEED ${SEED_S10})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS -fintelfpga -c -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsdaz -Xsrounding=faithful -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+set(FINAL_LINK_FLAGS -fintelfpga -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2}")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+
+#copy input data
+configure_file("data/ordered_inputs.csv" "data/ordered_inputs.csv" COPYONLY)
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpgas
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "crr_fpga.o")
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${SOURCE_FILE})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} ${DEVICE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_FPGA_OBJ})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CRR_common.hpp CRR_common.hpp COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${SOURCE_FILE} CRR_common.hpp)
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu data/ordered_inputs.csv -o=data/ordered_output.csv
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp
new file mode 100755
index 0000000000..6f2537e1e0
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp
@@ -0,0 +1,149 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __CRR_COMMON_H__
+#define __CRR_COMMON_H__
+
+constexpr int kMaxStringLen = 1024;
+
+// Increments of kMaxNSteps
+constexpr size_t kMaxNSteps = 8189;
+constexpr size_t kMaxNSteps1 = 8190;
+constexpr size_t kMaxNSteps2 = 8191;
+constexpr size_t kMaxNSteps3 = 8192;
+
+// Increment by a small epsilon in order to compute derivative
+// of option price with respect to Vol or Interest. The derivatives
+// are then used to compute Vega and Rho.
+constexpr double kEpsilon = 0.0001;
+
+// Whenever calculations are made for Option Price 0, n_steps must be incremented
+// by 2 to ensure all the required derivative prices are calculated.
+constexpr size_t kOpt0 = 2;
+
+
+// Solver configuration settings that are dependent on selected
+// board. Most notable settings are:
+
+// OUTER_UNROLL controls the number of CRRs that can be processed
+// in parallel in a SIMD fashion (number of CRRS must be >= OUTER_UNROLL).
+// This is ideally a power of two, but does not have to be. Since
+// the DRAM bandwidth requirement is low, increasing OUTER_UNROLL
+// should result in fairly linear speedup. (max: 32 on PAC A10)
+
+// INNER_UNROLL controls the degree of parallelization within
+// the calculation of a single CRR. This must be a power of two. Increasing
+// INNER_UNROLL has a lower area overhead than increasing OUTER_UNROLL;
+// however, there are diminishing returns as INNER_UNROLL is increased with
+// respect to the number of time steps. (max: 128 on PAC A10)
+
+
+// Data structure for original input data.
+typedef struct {
+ int cp; /* cp = -1 or 1 for Put & Call respectively. */
+ double n_steps; /* n_steps = number of time steps in the binomial tree. */
+ double strike; /* strike = exercise price of option. */
+ double spot; /* spot = spot price of the underlying. */
+ double fwd; /* fwd = forward price of the underlying. */
+ double vol; /* vol = per cent volatility, input as a decimal. */
+ double df; /* df = discount factor to option expiry. */
+ double t; /* t = time in years to the maturity of the option. */
+
+} InputData;
+
+// Data structure as the inputs to FPGA.
+// Element[i] is used to compute option_price[i].
+typedef struct {
+ double n_steps; /* n_steps = number of time steps in the binomial tree. */
+ double u[3]; /* u = the increase factor of an up movement in the binomial tree,
+ same for each time step. */
+ double u2[3]; /* u2 = the square of the increase factor. */
+ double c1[3]; /* c1 = the probability of a down movement in the binomial tree,
+ same for each time step. */
+ double c2[3]; /* c2 = the probability of an up movement in the binomial tree. */
+ double umin[3]; /* umin = minimum price of the underlying at the maturity. */
+ double param_1[3];/* param_1[i] = cp * umin[i] */
+ double param_2; /* param_2 = cp * strike */
+
+} CRRInParams;
+
+// Data structure as the output from ProcessKernelResult().
+typedef struct {
+ double pgreek[4]; /* Stores the 4 derivative prices in the binomial tree
+ required to compute the Premium and Greeks. */
+ double vals[3]; /* Three option prices calculated */
+
+} InterRes;
+
+// Data structure for option price and five Greeks.
+typedef struct {
+ double value; /* value = option price. */
+ double delta;
+ double gamma;
+ double vega;
+ double theta;
+ double rho;
+} OutputRes;
+
+// Data structures required by the kernel
+typedef struct {
+ double u;
+ double c1;
+ double c2;
+ double param_1;
+ double param_2;
+ short n_steps;
+ short pad1;
+ int pad2;
+ double pad3;
+ double pad4;
+} CRRMeta;
+
+typedef struct {
+ double u2;
+ double p1powu;
+ double init_optval;
+ double pad;
+} ArrayEle;
+
+typedef struct {
+ ArrayEle array_eles[kMaxNSteps3][3]; /* Second dimension size set to 3 to have a
+ separate ArrayEle for each option price */
+} CRRArrayEles;
+
+typedef struct {
+ ArrayEle array_eles[kMaxNSteps3];
+} CRRPerStepMeta;
+
+typedef struct {
+ double pgreek[4];
+ double optval0;
+ double pad[3];
+} CRRResParams;
+
+#endif
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja
new file mode 100755
index 0000000000..58af917f67
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja
@@ -0,0 +1,35 @@
+source_file = main.cpp
+target_name = crr
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsfpc -Xsparallel=2 -Xsseed=5
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+a10_flags = -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1
+s10_flags = -DOUTER_UNROLL=2 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=2
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${a10_flags} $in -o $out
+
+rule build_fpga_emu_s10
+ command = dpcpp /GX ${emulator_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${a10_flags} -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv
new file mode 100755
index 0000000000..3a28083fa2
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv
@@ -0,0 +1,10 @@
+8189,-1,37.5,37.50112053,85,0.4,0.99997012,0.011952191
+8189,1,37.5,37.50112053,85,0.4,0.99997012,0.011952191
+8189,-1,270,270.0080678,65,0.18,0.999940241,0.011952191
+8189,1,270,270.0080678,65,0.18,0.999940241,0.011952191
+8189,-1,292.5,292.5087402,70,0.35,0.999940241,0.011952191
+8189,1,292.5,292.5087402,70,0.35,0.999940241,0.011952191
+8189,-1,122.5,122.5109816,40,0.2,0.999910363,0.011952191
+8189,1,122.5,122.5109816,40,0.2,0.999910363,0.011952191
+8189,-1,22.5,22.50067232,55,0.3,0.999910363,0.011952191
+8189,1,22.5,22.50067232,55,0.3,0.999910363,0.011952191
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp
new file mode 100755
index 0000000000..7c92610e19
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp
@@ -0,0 +1,849 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// CRRSolver CPU/FPGA Accelerator Demo Program
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// This design implements a simple Cox-Ross-Rubinstein (CRR) binomial tree model
+// with Greeks for American exercise options.
+//
+//
+// Optimization summary:
+// -- Area-consuming but infrequent calculation is done on CPU.
+// -- Parallelize the calculation of a single CRR.
+// -- Run multiple independent CRRs in parallel.
+// -- Optimized memory configurations to reduce the need for replication
+// and to eliminate the need for double-pumping M20Ks.
+//
+// The following diagram shows the mechanism of optimizations to CRR.
+//
+//
+// +------+ ^
+// +------------>|optval| |
+// | | [2] | |
+// | +------+ |
+// | |
+// | |
+// +--+---+ |
+// +------------>|optval| |
+// | | [1] | |
+// | +--+---+ |
+// | | |
+// | | |
+// | | | Loop4(L4)
+// | | | updates
+// +---+--+ +------------>+------+ | multiple
+// |optval| |optval| | elements
+// | [0] | | [1] | | in optval[]
+// +---+--+ +------------>+------+ | simultaneously
+// | | |
+// | | |
+// | | |
+// | | |
+// | +--+---+ |
+// | |optval| |
+// +------------>| [0] | |
+// +--+---+ |
+// | |
+// | |
+// | +------+ |
+// | |optval| |
+// +------------>| [0] | |
+// +------+ +
+//
+//
+//
+//
+// step 1 step 2
+//
+//
+// <------------------------------------------+
+// Loop3(L3) updates each level of the tree
+//
+//
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "CRR_common.hpp"
+#include "dpc_common.hpp"
+
+using namespace std;
+using namespace sycl;
+
+class CRRSolver;
+double CrrSolver(const int n_items, vector<CRRMeta> &in_params,
+ vector<CRRResParams> &res_params,
+ vector<CRRPerStepMeta> &in_params2, queue &q) {
+ dpc_common::TimeInterval timer;
+
+ constexpr int steps = kMaxNSteps2;
+
+ const int n_crr =
+ (((n_items + (OUTER_UNROLL - 1)) / OUTER_UNROLL) * OUTER_UNROLL) * 3;
+
+ {
+ buffer<CRRMeta, 1> i_params(in_params.data(), in_params.size());
+ buffer<CRRResParams, 1> r_params(res_params.data(), res_params.size());
+ buffer<CRRPerStepMeta, 1> a_params(in_params2.data(), in_params2.size());
+
+ event e;
+ {
+ e = q.submit([&](handler &h) {
+ auto accessor_v =
+ i_params.template get_access<access::mode::read>(h);
+
+ auto accessor_v2 =
+ a_params.template get_access<access::mode::read>(h);
+
+ auto accessor_r =
+ r_params.template get_access<access::mode::write>(h);
+
+ h.single_task<CRRSolver>([=]() [[intel::kernel_args_restrict]] {
+ // Kernel requires n_crr to be a multiple of OUTER_UNROLL.
+ // This is taken care of by the host.
+ const int n_crr_div = n_crr / OUTER_UNROLL;
+
+ // Outerloop counter. Use while-loop for better timing-closure
+ // characteristics because it tells the compiler the loop body will
+ // never be skipped.
+ int oc = 0;
+ do {
+ // Metadata of CRR problems
+ [[intelfpga::register]] double u[OUTER_UNROLL];
+ [[intelfpga::register]] double c1[OUTER_UNROLL];
+ [[intelfpga::register]] double c2[OUTER_UNROLL];
+ [[intelfpga::register]] double param_1[OUTER_UNROLL];
+ [[intelfpga::register]] double param_2[OUTER_UNROLL];
+ [[intelfpga::register]] short n_steps[OUTER_UNROLL];
+
+ // Current values in binomial tree. We only need to keep track of
+ // one level worth of data, not the entire tree.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double optval[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // Initial values in binomial tree, which correspond to the last
+ // level of the binomial tree.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double init_optval[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // u2_array precalculates the power function of u2.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double u2_array[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // p1powu_array precalculates p1 multiplied by the power of u.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double p1powu_array[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // n0_optval stores the binomial tree value corresponding to node 0
+ // of a level. This is the same as what's stored in
+ // optval/init_optval, but replicating this data allows us to have
+ // only one read port for optval and init_optval, thereby removing
+ // the need of double-pumping or replication. n0_optval_2 is a copy
+ // of n0_optval that stores the node 0 value for a specific layer of
+ // the tree. pgreek is the array saving values for post-calculating
+ // Greeks.
+ [[intelfpga::register]] double n0_optval[OUTER_UNROLL];
+ [[intelfpga::register]] double n0_optval_2[OUTER_UNROLL];
+ [[intelfpga::register]] double pgreek[4][OUTER_UNROLL];
+
+ // L1 + L2:
+ // Populate init_optval -- calculate the last level of the binomial
+ // tree.
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ // Transfer data from DRAM to local memory or registers
+ const int c = oc * OUTER_UNROLL + ic;
+ const CRRMeta param = accessor_v[c];
+
+ u[ic] = param.u;
+ c1[ic] = param.c1;
+ c2[ic] = param.c2;
+ param_1[ic] = param.param_1;
+ param_2[ic] = param.param_2;
+ n_steps[ic] = param.n_steps;
+
+ for (short t = steps; t >= 0; --t) {
+ const ArrayEle param_array = accessor_v2[c].array_eles[t];
+
+ const double init_val = param_array.init_optval;
+
+ init_optval[t][ic] = init_val;
+
+ // n0_optval intends to store the node value at t == 0.
+ // Instead of qualifying this statement by an "if (t == 0)",
+ // which couples the loop counter to the timing path of the
+ // assignment, we reverse the loop direction so the last value
+ // stored corresponds to t == 0.
+ n0_optval[ic] = init_val;
+
+ // Transfer data from DRAM to local memory or registers
+ u2_array[t][ic] = param_array.u2;
+ p1powu_array[t][ic] = param_array.p1powu;
+ }
+ }
+
+ // L3:
+ // Update optval[] -- calculate each level of the binomial tree.
+ // reg[] helps to achieve updating INNER_UNROLL elements in optval[]
+ // simultaneously.
+ [[intelfpga::disable_loop_pipelining]] for (short t = 0;
+ t <= steps - 1; ++t) {
+ [[intelfpga::register]] double reg[INNER_UNROLL + 1][OUTER_UNROLL];
+
+ double val_1, val_2;
+
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ reg[0][ic] = n0_optval[ic];
+ }
+
+ // L4:
+ // Calculate all the elements in optval[] -- all the tree nodes
+ // for one level of the tree
+ [[intelfpga::ivdep]] for (int n = 0; n <= steps - 1 - t;
+ n += INNER_UNROLL) {
+
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+
+ #pragma unroll
+ for (short ri = 1; ri <= INNER_UNROLL; ++ri) {
+ reg[ri][ic] =
+ (t == 0) ? init_optval[n + ri][ic] : optval[n + ri][ic];
+ }
+
+ #pragma unroll
+ for (short ri = 0; ri < INNER_UNROLL; ++ri) {
+ const double val = sycl::fmax(
+ c1[ic] * reg[ri][ic] + c2[ic] * reg[ri + 1][ic],
+ p1powu_array[t][ic] * u2_array[n + ri][ic] -
+ param_2[ic]);
+
+ optval[n + ri][ic] = val;
+ if (n + ri == 0) {
+ n0_optval[ic] = val;
+ }
+ if (n + ri == 1) {
+ val_1 = val;
+ }
+ if (n + ri == 2) {
+ val_2 = val;
+ }
+ }
+
+ reg[0][ic] = reg[INNER_UNROLL][ic];
+
+ if (t == steps - 5) {
+ pgreek[3][ic] = val_2;
+ }
+ if (t == steps - 3) {
+ pgreek[0][ic] = n0_optval[ic];
+ pgreek[1][ic] = val_1;
+ pgreek[2][ic] = val_2;
+ n0_optval_2[ic] = n0_optval[ic];
+ }
+ }
+ }
+ }
+
+ // L5: transfer crr_res_params to DRAM
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ const int c = oc * OUTER_UNROLL + ic;
+ if (n_steps[ic] < steps) {
+ accessor_r[c].optval0 = n0_optval_2[ic];
+ } else {
+ accessor_r[c].optval0 = n0_optval[ic];
+ }
+ accessor_r[c].pgreek[0] = pgreek[0][ic];
+ accessor_r[c].pgreek[1] = pgreek[1][ic];
+ accessor_r[c].pgreek[2] = pgreek[2][ic];
+ accessor_r[c].pgreek[3] = pgreek[3][ic];
+ }
+ // Increment counters
+ oc += 1;
+ } while (oc < n_crr_div);
+ });
+ });
+ }
+ }
+
+ double diff = timer.Elapsed();
+ return diff;
+}
+
+void ReadInputFromFile(ifstream &input_file, vector<InputData> &inp) {
+ string line_of_args;
+ while (getline(input_file, line_of_args)) {
+ InputData temp;
+ istringstream line_of_args_ss(line_of_args);
+ line_of_args_ss >> temp.n_steps;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.cp;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.spot;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.fwd;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.strike;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.vol;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.df;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.t;
+
+ inp.push_back(temp);
+ }
+}
+
+static string ToStringWithPrecision(const double value, const int p = 6) {
+ ostringstream out;
+ out.precision(p);
+ out << std::fixed << value;
+ return out.str();
+}
+
+void WriteOutputToFile(ofstream &output_file, const vector<OutputRes> &outp) {
+ size_t n = outp.size();
+ for (size_t i = 0; i < n; ++i) {
+ OutputRes temp;
+ temp = outp[i];
+ string line = ToStringWithPrecision(temp.value, 12) + " " +
+ ToStringWithPrecision(temp.delta, 12) + " " +
+ ToStringWithPrecision(temp.gamma, 12) + " " +
+ ToStringWithPrecision(temp.vega, 12) + " " +
+ ToStringWithPrecision(temp.theta, 12) + " " +
+ ToStringWithPrecision(temp.rho, 12) + "\n";
+
+ output_file << line;
+ }
+}
+
+bool FindGetArgString(const string &arg, const char *str, char *str_value,
+ size_t maxchars) {
+ size_t found = arg.find(str, 0, strlen(str));
+ if (found != string::npos) {
+ const char *sptr = &arg.c_str()[strlen(str)];
+ for (int i = 0; i < maxchars - 1; i++) {
+ char ch = sptr[i];
+ switch (ch) {
+ case ' ':
+ case '\t':
+ case '\0':
+ str_value[i] = 0;
+ return true;
+ break;
+ default:
+ str_value[i] = ch;
+ break;
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Perform data pre-processing work
+// Three different option prices are required to solve each CRR problem
+// The following lists why each option price is required:
+// [0] : Used to compute Premium, Delta, Gamma and Theta
+// [1] : Used to compute Rho
+// [2] : Used to compute Vega
+CRRInParams PrepareData(const InputData &inp) {
+ CRRInParams in_params;
+ in_params.n_steps = inp.n_steps;
+
+ double r[2];
+ r[0] = pow(inp.df, 1.0 / inp.n_steps);
+ double d_df = exp(-inp.t * kEpsilon);
+ r[1] = pow(inp.df * d_df, 1.0 / inp.n_steps);
+ in_params.u[0] = exp(inp.vol * sqrt(inp.t / inp.n_steps));
+ in_params.u[1] = in_params.u[0];
+ in_params.u[2] = exp((inp.vol + kEpsilon) * sqrt(inp.t / inp.n_steps));
+
+ in_params.u2[0] = in_params.u[0] * in_params.u[0];
+ in_params.u2[1] = in_params.u[1] * in_params.u[1];
+ in_params.u2[2] = in_params.u[2] * in_params.u[2];
+ in_params.umin[0] = inp.spot * pow(1 / in_params.u[0], inp.n_steps + kOpt0);
+ in_params.umin[1] = inp.spot * pow(1 / in_params.u[1], inp.n_steps);
+ in_params.umin[2] = inp.spot * pow(1 / in_params.u[2], inp.n_steps);
+ in_params.c1[0] =
+ r[0] * (in_params.u[0] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[0] - 1 / in_params.u[0]);
+ in_params.c1[1] =
+ r[1] *(in_params.u[1] - pow((inp.fwd / d_df) / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[1] - 1 / in_params.u[1]);
+ in_params.c1[2] =
+ r[0] * (in_params.u[2] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[2] - 1 / in_params.u[2]);
+ in_params.c2[0] = r[0] - in_params.c1[0];
+ in_params.c2[1] = r[1] - in_params.c1[1];
+ in_params.c2[2] = r[0] - in_params.c1[2];
+
+ in_params.param_1[0] = inp.cp * in_params.umin[0];
+ in_params.param_1[1] = inp.cp * in_params.umin[1];
+ in_params.param_1[2] = inp.cp * in_params.umin[2];
+ in_params.param_2 = inp.cp * inp.strike;
+
+ return in_params;
+}
+
+CRRArrayEles PrepareArrData(const CRRInParams &in) {
+ CRRArrayEles arr;
+
+ // Write in reverse t-direction to match kernel access pattern
+ for (int i = 0; i <= in.n_steps + kOpt0; ++i) {
+ for (int inner_func_index = 0; inner_func_index < 3; ++inner_func_index) {
+ arr.array_eles[i][inner_func_index].u2 = pow(in.u2[inner_func_index], i);
+ arr.array_eles[i][inner_func_index].p1powu =
+ in.param_1[inner_func_index] * pow(in.u[inner_func_index], i + 1);
+ arr.array_eles[i][inner_func_index].init_optval =
+ fmax(in.param_1[inner_func_index] * pow(in.u2[inner_func_index], i) -
+ in.param_2, 0.0);
+ }
+ }
+
+ return arr;
+}
+
+// Metadata, used in the Kernel, is generated from the input data
+// Each CRR problem is split into 3 subproblems to calculate
+// each required option price separately
+void PrepareKernelData(vector<CRRInParams> &in_params,
+ vector<CRRArrayEles> &array_params,
+ vector<CRRMeta> &in_buff_params,
+ vector<CRRPerStepMeta> &in_buff2_params,
+ const int n_crrs) {
+
+ constexpr short offset = 0;
+
+ for (int wi_idx = offset, dst = offset * 3; wi_idx < n_crrs; ++wi_idx) {
+ CRRInParams &src_crr_params = in_params[wi_idx];
+
+ CRRArrayEles &src_crr_eles = array_params[wi_idx];
+
+ for (int inner_func_index = 0; inner_func_index < 3;
+ ++inner_func_index, ++dst) {
+ CRRMeta &dst_crr_meta = in_buff_params[dst];
+ CRRPerStepMeta &dst_crr_per_step_meta = in_buff2_params[dst];
+
+ dst_crr_meta.u = src_crr_params.u[inner_func_index];
+ dst_crr_meta.c1 = src_crr_params.c1[inner_func_index];
+ dst_crr_meta.c2 = src_crr_params.c2[inner_func_index];
+
+ dst_crr_meta.param_1 = src_crr_params.param_1[inner_func_index];
+ dst_crr_meta.param_2 = src_crr_params.param_2;
+
+ if (inner_func_index == 0) {
+ dst_crr_meta.n_steps = src_crr_params.n_steps + kOpt0;
+ } else {
+ dst_crr_meta.n_steps = src_crr_params.n_steps;
+ }
+ for (int i = 0; i <= kMaxNSteps2; ++i) {
+ dst_crr_per_step_meta.array_eles[i].u2 =
+ src_crr_eles.array_eles[i][inner_func_index].u2;
+ dst_crr_per_step_meta.array_eles[i].p1powu =
+ src_crr_eles.array_eles[i][inner_func_index].p1powu;
+ dst_crr_per_step_meta.array_eles[i].init_optval =
+ src_crr_eles.array_eles[i][inner_func_index].init_optval;
+ }
+ }
+ }
+}
+
+// Takes in the result from the kernel and stores the 3 option prices
+// belonging to the same CRR problem in one InterRes element
+void ProcessKernelResult(const vector<CRRResParams> &res_params,
+ vector<InterRes> &postp_buff, const int n_crrs) {
+ constexpr int offset = 0;
+
+ for (int wi_idx = offset, src = offset * 3; wi_idx < n_crrs; ++wi_idx) {
+ InterRes &dst_res = postp_buff[wi_idx];
+
+ for (int inner_func_index = 0; inner_func_index < 3;
+ ++inner_func_index, ++src) {
+ const CRRResParams &src_res = res_params[src];
+
+ for (int i = 0; i < 4; ++i) {
+ if (inner_func_index == 0) {
+ dst_res.pgreek[i] = src_res.pgreek[i];
+ }
+ }
+
+ dst_res.vals[inner_func_index] = src_res.optval0;
+ }
+ }
+}
+
+// Computes the Premium and Greeks
+OutputRes ComputeOutput(const InputData &inp, const CRRInParams &in_params,
+ const InterRes &res_params) {
+ double h;
+ OutputRes res;
+ h = inp.spot * (in_params.u2[0] - 1 / in_params.u2[0]);
+ res.value = res_params.pgreek[1];
+ res.delta = (res_params.pgreek[2] - res_params.pgreek[0]) / h;
+ res.gamma = 2 / h *
+ ((res_params.pgreek[2] - res_params.pgreek[1]) / inp.spot /
+ (in_params.u2[0] - 1) -
+ (res_params.pgreek[1] - res_params.pgreek[0]) / inp.spot /
+ (1 - (1 / in_params.u2[0])));
+ res.theta =
+ (res_params.vals[0] - res_params.pgreek[3]) / 4 / inp.t * inp.n_steps;
+ res.rho = (res_params.vals[1] - res.value) / kEpsilon;
+ res.vega = (res_params.vals[2] - res.value) / kEpsilon;
+ return res;
+}
+
+// Perform CRR solving using the CPU and compare FPGA results with CPU results
+// to test correctness.
+void TestCorrectness(int k, int n_crrs, bool &pass, const InputData &inp,
+ CRRInParams &vals, const OutputRes &fpga_res) {
+ if (k == 0) {
+ std::cout << "\n============= Correctness Test ============= \n";
+ std::cout << "Running analytical correctness checks... \n";
+ }
+
+ // This CRR benchmark ensures a minimum of 4 decimal places of agreement between the FPGA and the CPU
+ // "threshold" is chosen to enforce this guarantee
+ float threshold = 0.00001;
+ int i, j, q;
+ double x;
+ int n_steps = vals.n_steps;
+ int m = n_steps + kOpt0;
+ vector<double> pvalue(kMaxNSteps3);
+ vector<double> pvalue_1(kMaxNSteps1);
+ vector<double> pvalue_2(kMaxNSteps1);
+ vector<double> pgreek(5);
+ InterRes cpu_res_params;
+ OutputRes cpu_res;
+
+ // option value computed at each final node
+ x = vals.umin[0];
+ for (i = 0; i <= m; i++, x *= vals.u2[0]) {
+ pvalue[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+ }
+
+ // backward recursion to evaluate option price
+ for (i = m - 1; i >= 0; i--) {
+ vals.umin[0] *= vals.u[0];
+ x = vals.umin[0];
+ for (j = 0; j <= i; j++, x *= vals.u2[0]) {
+ pvalue[j] = fmax(vals.c1[0] * pvalue[j] + vals.c2[0] * pvalue[j + 1],
+ inp.cp * (x - inp.strike));
+ }
+ if (i == 4) {
+ pgreek[4] = pvalue[2];
+ }
+ if (i == 2) {
+ for (q = 0; q <= 2; q++) {
+ pgreek[q + 1] = pvalue[q];
+ }
+ }
+ }
+ cpu_res_params.vals[0] = pvalue[0];
+
+ // the above computation is repeated for each option price
+ x = vals.umin[1];
+ for (i = 0; i <= n_steps; i++, x *= vals.u2[1]) {
+ pvalue_1[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+ }
+
+ for (i = n_steps - 1; i >= 0; i--) {
+ vals.umin[1] *= vals.u[1];
+ x = vals.umin[1];
+
+ for (j = 0; j <= i; j++, x *= vals.u2[1]) {
+ pvalue_1[j] =
+ fmax(vals.c1[1] * pvalue_1[j] + vals.c2[1] * pvalue_1[j + 1],
+ inp.cp * (x - inp.strike));
+ }
+ }
+ cpu_res_params.vals[1] = pvalue_1[0];
+
+ x = vals.umin[2];
+ for (i = 0; i <= n_steps; i++, x *= vals.u2[2]) {
+ pvalue_2[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+ }
+
+ for (i = n_steps - 1; i >= 0; i--) {
+ vals.umin[2] *= vals.u[2];
+ x = vals.umin[2];
+ for (j = 0; j <= i; j++, x *= vals.u2[2]) {
+ pvalue_2[j] =
+ fmax(vals.c1[2] * pvalue_2[j] + vals.c2[2] * pvalue_2[j + 1],
+ inp.cp * (x - inp.strike));
+ }
+ }
+ cpu_res_params.vals[2] = pvalue_2[0];
+ pgreek[0] = 0;
+
+ for (i = 1; i < 5; ++i) {
+ cpu_res_params.pgreek[i - 1] = pgreek[i];
+ }
+
+ cpu_res = ComputeOutput(inp, vals, cpu_res_params);
+
+ if (abs(cpu_res.value - fpga_res.value) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.value " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.value << "\n";
+ std::cout << "cpu_res.value " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.value << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.delta - fpga_res.delta) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.delta " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.delta << "\n";
+ std::cout << "cpu_res.delta " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.delta << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.gamma - fpga_res.gamma) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.gamma " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.gamma << "\n";
+ std::cout << "cpu_res.gamma " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.gamma << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.vega - fpga_res.vega) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.vega " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.vega << "\n";
+ std::cout << "cpu_res.vega " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.vega << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.theta - fpga_res.theta) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.theta " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.theta << "\n";
+ std::cout << "cpu_res.theta " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.theta << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.rho - fpga_res.rho) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.rho " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.rho << "\n";
+ std::cout << "cpu_res.rho " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.rho << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+
+ if (k == n_crrs - 1) {
+ std::cout << "CPU-FPGA Equivalence: " << (pass ? "PASS" : "FAIL") << "\n";
+ }
+}
+
+// Print out the achieved CRR throughput
+void TestThroughput(const double &time, const int &n_crrs) {
+ std::cout << "\n============= Throughput Test =============\n";
+
+ std::cout << " Avg throughput: " << std::fixed << std::setprecision(1)
+ << (n_crrs / time) << " assets/s\n";
+}
+
+int main(int argc, char *argv[]) {
+ string infilename = "";
+ string outfilename = "";
+
+ const string default_ifile = "src/data/ordered_inputs.csv";
+ const string default_ofile = "src/data/ordered_outputs.csv";
+
+ char str_buffer[kMaxStringLen] = {0};
+ for (int i = 1; i < argc; i++) {
+ if (argv[i][0] == '-') {
+ string sarg(argv[i]);
+
+ FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen);
+ FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen);
+ } else {
+ infilename = string(argv[i]);
+ }
+ }
+
+ try {
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ queue q(device_selector, dpc_common::exception_handler);
+
+ std::cout << "Running on device: "
+ << q.get_device().get_info<info::device::name>().c_str() << "\n";
+
+ device device = q.get_device();
+ std::cout << "Device name: "
+ << device.get_info<info::device::name>().c_str() << "\n \n \n";
+
+ vector<InputData> inp;
+
+ // Get the input file name; if the user does not provide an input file,
+ // the design uses the default input file
+ if (infilename == "") {
+ infilename = default_ifile;
+ }
+ ifstream inputFile(infilename);
+
+ if (!inputFile.is_open()) {
+ std::cerr << "Input file doesn't exist \n";
+ return 1;
+ }
+
+ // Check input file format
+ string filename = infilename;
+ std::size_t found = filename.find_last_of(".");
+ if (!(filename.substr(found + 1).compare("csv") == 0)) {
+ std::cerr << "Input file format only support .csv\n";
+ return 1;
+ }
+
+ // Get the output file name; if the user does not specify one, the design
+ // uses the default output file
+ outfilename = default_ofile;
+ if (strlen(str_buffer)) {
+ outfilename = string(str_buffer);
+ }
+
+ // Check output file format
+ filename = outfilename;
+ found = filename.find_last_of(".");
+ if (!(filename.substr(found + 1).compare("csv") == 0)) {
+ std::cerr << "Output file format only support .csv\n";
+ return 1;
+ }
+
+ // Read inputs data from input file
+ ReadInputFromFile(inputFile, inp);
+
+// Get the number of data items from the input file
+// Emulator mode only goes through one input (or through OUTER_UNROLL inputs) to
+// ensure fast runtime
+#if defined(FPGA_EMULATOR)
+ int temp_crrs = 1;
+#else
+ int temp_crrs = inp.size();
+#endif
+
+ // Check if n_crrs >= OUTER_UNROLL
+ if (OUTER_UNROLL >= temp_crrs) {
+ if (inp.size() < OUTER_UNROLL) {
+ std::cerr << "Input size must be greater than or equal to OUTER_UNROLL\n";
+ return 1;
+ } else {
+ temp_crrs = OUTER_UNROLL;
+ }
+ }
+
+ const int n_crrs = temp_crrs;
+
+ vector<CRRInParams> in_params(n_crrs);
+ vector<CRRArrayEles> array_params(n_crrs);
+
+ for (int j = 0; j < n_crrs; ++j) {
+ in_params[j] = PrepareData(inp[j]);
+ array_params[j] = PrepareArrData(in_params[j]);
+ }
+
+ // The following vectors are arguments for CrrSolver
+ vector<CRRMeta> in_buff_params(n_crrs * 3);
+ vector<CRRPerStepMeta> in_buff2_params(n_crrs * 3);
+
+ vector<CRRResParams> res_params(n_crrs * 3);
+ vector<CRRResParams> res_params_dummy(n_crrs * 3);
+
+ // Prepare metadata as input to kernel
+ PrepareKernelData(in_params, array_params, in_buff_params, in_buff2_params,
+ n_crrs);
+
+ // Warmup run - use this run to warm up the accelerator
+ CrrSolver(n_crrs, in_buff_params, res_params_dummy, in_buff2_params,
+ q);
+ // Timed run - profile performance
+ double time = CrrSolver(n_crrs, in_buff_params, res_params,
+ in_buff2_params, q);
+ bool pass = true;
+
+ // Postprocessing step
+ // process_res used to compute final results
+ vector<InterRes> process_res(n_crrs);
+ ProcessKernelResult(res_params, process_res, n_crrs);
+
+ vector<OutputRes> result(n_crrs);
+ for (int i = 0; i < n_crrs; ++i) {
+ result[i] = ComputeOutput(inp[i], in_params[i], process_res[i]);
+ TestCorrectness(i, n_crrs, pass, inp[i], in_params[i], result[i]);
+ }
+
+ // Write output data to the output file
+ ofstream outputFile(outfilename);
+
+ WriteOutputToFile(outputFile, result);
+
+ TestThroughput(time, n_crrs);
+
+ } catch (sycl::exception const &e) {
+ std::cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
+    std::cout << "   If you are targeting FPGA hardware, "
+                 "ensure that your system is plugged into an FPGA board that "
+                 "is set up correctly\n";
+ std::cout << " If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR\n";
+ return 1;
+ }
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt
new file mode 100755
index 0000000000..9ac77b0aff
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(GZip)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md
new file mode 100755
index 0000000000..18117a82a5
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md
@@ -0,0 +1,201 @@
+# GZIP Compression
+Reference design demonstrating high-performance GZIP compression on FPGA.
+
+***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. Additional reference material specific to this GZIP implementation is provided in the References section of this README.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; <br> Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta) <br> Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How to implement a high performance multi-engine compression algorithm on FPGA
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+**Performance**
+Please refer to the performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 1 engine @ 3.4 GB/s
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 2 engines @ 5.5 GB/s each = 11.0 GB/s total
+
+
+## Purpose
+
+This DPC++ reference design implements a compression algorithm optimized for the FPGA device. The compressed output is GZIP-compatible and can be decompressed with GUNZIP. The output follows GZIP's DEFLATE format, using a fixed subset of [RFC 1951](https://www.ietf.org/rfc/rfc1951.txt). See the References section for further reading.
+
+The algorithm uses a GZIP-compatible Lempel-Ziv 77 (LZ77) algorithm for data de-duplication and a GZIP-compatible Static Huffman algorithm for bit reduction. The implementation includes three FPGA-accelerated tasks (LZ77, Static Huffman, and CRC).
+
+The FPGA implementation of the algorithm enables either one or two independent GZIP compute engines to operate in parallel on the FPGA. The number of engines is constrained by the available FPGA resources. By default, the design is parameterized to create a single engine when the design is compiled targeting Intel® PAC with Intel Arria® 10 GX FPGA. Two engines are created when targeting Intel® PAC with Intel Stratix® 10 SX FPGA, a larger device.
+
+## Key Implementation Details
+
+ | Kernel | Description
+--- |---
+| LZ Reduction | Implements an LZ77 algorithm for data de-duplication. The algorithm produces distance and length information that is compatible with GZIP's DEFLATE implementation.
+| Static Huffman | Uses the same Static Huffman codes used by GZIP's DEFLATE algorithm when it chooses a Static Huffman coding scheme for bit reduction. This choice maintains compatibility with GUNZIP.
+| CRC | Adds a CRC checksum based on the input file; this is required by the GZIP file format.
+
+To optimize performance, GZIP leverages techniques discussed in the following FPGA tutorials (a minimal sketch of the double-buffering pattern follows the list):
+* **Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing** (double_buffering)
+* **On-Chip Memory Attributes** (mem_config)
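+
+The sketch below is generic and **not** this design's kernel code; the queue, buffer sizes, and `ProducerKernel` name are illustrative only. It shows the double-buffering idea: while the device works on one buffer, the host is free to prepare or post-process the other.
+
+```
+#include <CL/sycl.hpp>
+#include <array>
+#include <cstddef>
+
+using namespace cl::sycl;
+
+// Launch `iterations` kernels over two alternating buffers.
+void RunDoubleBuffered(queue &q, int iterations, size_t n) {
+  std::array<buffer<int, 1>, 2> bufs{{buffer<int, 1>{range<1>(n)},
+                                      buffer<int, 1>{range<1>(n)}}};
+  std::array<event, 2> done;
+  for (int i = 0; i < iterations; i++) {
+    int set = i % 2;               // alternate between the two buffer sets
+    if (i >= 2) done[set].wait();  // a set is free once its previous run ended
+    done[set] = q.submit([&](handler &h) {
+      auto acc = bufs[set].get_access<access::mode::discard_write>(h);
+      h.single_task<class ProducerKernel>([=] { acc[0] = 1; });
+    });
+    // ... host-side processing of the *other* buffer set would go here ...
+  }
+  q.wait();
+}
+```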
+
+
+## License
+This code sample is licensed under MIT license.
+
+
+## Building the `gzip` Reference Design
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+   ./gzip.fpga_emu <input_file> [-o=<output_file>]     (Linux)
+   gzip.fpga_emu.exe <input_file> [-o=<output_file>]   (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+   ./gzip.fpga <input_file> [-o=<output_file>]         (Linux)
+ ```
+ ### Application Parameters
+
+| Argument | Description
+--- |---
+| `<input_file>` | Mandatory argument that specifies the file to be compressed. Use a 120+ MB file to achieve peak performance.
+| `-o=<output_file>` | Optional argument that specifies the name of the output file. The default name of the output file is `<input_file>.gz`. When targeting Intel Stratix® 10 SX, the single `<input_file>` is fed to both engines, yielding two identical output files, using `<output_file>` as the basis for the filenames.
+
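+For example, to compress a file named `my_data.bin` (file name illustrative) on the FPGA device and write the result to `my_data.gz`:
+
+```
+./gzip.fpga my_data.bin -o=my_data.gz
+```
+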
+### Example of Output
+
+```
+Running on device: pac_a10 : Intel PAC Platform (pac_ee00000)
+Throughput: 3.4321 GB/s
+Compression Ratio 33.2737%
+PASSED
+```
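+
+On Linux, the output can be checked against the original input with standard tools; this mirrors what `CompareGzip.cpp` does in the design's verification step (file names illustrative):
+
+```
+gunzip -c my_data.gz > /tmp/my_data.decompressed
+diff -q /tmp/my_data.decompressed my_data.bin
+```
+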
+## Additional Design Information
+### Source Code Explanation
+
+| File | Description
+--- |---
+| `gzip.cpp` | Contains the `main()` function and the top-level interfaces to the SYCL* GZIP functions.
+| `gzipkernel.cpp` | Contains the SYCL* kernels used to implement GZIP.
+| `CompareGzip.cpp` | Contains code to compare a GZIP-compatible file with the original input.
+| `WriteGzip.cpp` | Contains code to write a GZIP compatible file.
+| `crc32.cpp` | Contains code to calculate a 32-bit CRC that is compatible with the GZIP file format and to combine multiple 32-bit CRC values. It is used to account only for the CRC of the last few bytes in the file, which are not processed by the accelerated CRC kernel.
+| `kernels.hpp` | Contains miscellaneous defines and structure definitions required by the LZReduction and Static Huffman kernels.
+| `crc32.hpp` | Header file for `crc32.cpp`.
+| `gzipkernel.hpp` | Header file for `gzipkernel.cpp`.
+| `CompareGzip.hpp` | Header file for `CompareGzip.cpp`.
+| `WriteGzip.hpp` | Header file for `WriteGzip.cpp`.
+
+### Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus
+`-Xsseed=1` | Uses seed 1 during Quartus, yields slightly higher fmax
+`-Xsnum-reorder=6` | Specifies a wider data path for reads from global memory; applied only when targeting Intel Stratix® 10 SX
+`-DNUM_ENGINES=<1|2>` | Specifies that 1 GZIP engine should be compiled when targeting Arria® 10 GX and 2 engines when targeting Intel Stratix® 10 SX
+
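+For reference, the hardware compile driven by the generated `Makefile` is roughly equivalent to the following single command (shown for the Arria® 10 target; the actual build splits compilation and linking into several steps):
+
+```
+dpcpp -fintelfpga -Xshardware -Xsparallel=2 -Xsseed=1 -Xsboard=intel_a10gx_pac:pac_a10 -DNUM_ENGINES=1 src/gzipkernel.cpp src/gzip.cpp src/crc32.cpp src/WriteGzip.cpp src/CompareGzip.cpp -o gzip.fpga
+```
+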
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 29, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com).
+
+Performance was measured by Intel on July 29, 2020.
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+### References
+[Khronos SYCL Resources](https://www.khronos.org/sycl/resources)
+
+[Intel GZIP OpenCL Design Example](https://www.intel.com/content/www/us/en/programmable/support/support-resources/design-examples/design-software/opencl/gzip-compression.html)
+
+[RFC 1951 - DEFLATE Data Format](https://www.ietf.org/rfc/rfc1951.txt)
+
+[RFC 1952 - GZIP File Format Specification version 4.3](https://www.ietf.org/rfc/rfc1952.txt)
+
+[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt
new file mode 100755
index 0000000000..a75dd96a90
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt
@@ -0,0 +1,25 @@
+zlib License
+
+ zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.11, January 15th, 2017
+
+ Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup@gzip.org madler@alumni.caltech.edu
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln
new file mode 100755
index 0000000000..580f35f08b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gzip", "gzip.vcxproj", "{CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.ActiveCfg = Debug|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.Build.0 = Debug|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.ActiveCfg = Release|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {92BEFAAB-0365-4E5A-9C4A-E50AB49B2A6B}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj
new file mode 100755
index 0000000000..cf6a2462d2
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj
@@ -0,0 +1,174 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {cf6a576b-665d-4f24-bb62-0dae7a7b3c64}
+ Win32Proj
+ gzip
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user
new file mode 100755
index 0000000000..1956841792
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user
@@ -0,0 +1,14 @@
+
+
+
+ false
+
+
+ src/gzip.cpp -o=test.gz
+ WindowsLocalDebugger
+
+
+ src/gzip.cpp -o=test.gz
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json
new file mode 100755
index 0000000000..a6d65ecd17
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "D55081EB-669D-4832-BCE6-23EE2ACA9F0F",
+ "name": "GZIP Compression",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "Reference design demonstrating high-performance GZIP compression on FPGA",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./gzip.fpga_emu ../src/gzip.cpp -o=test.gz"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "gzip.fpga_emu.exe ../src/gzip.cpp -o=test.gz"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt
new file mode 100755
index 0000000000..bf6125045f
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt
@@ -0,0 +1,125 @@
+set(DEVICE_SOURCE_FILE gzipkernel.cpp)
+set(DEVICE_HEADER_FILE gzipkernel.hpp)
+set(HOST_SOURCE_FILE gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp)
+
+set(TARGET_NAME gzip)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+
+# To increase NUM_ENGINES beyond 2, you must also statically declare more engines in gzipkernel.cpp --> SubmitGzipTasks()
+set(NUM_ENGINES_A10 1)
+set(NUM_ENGINES_S10 2)
+set(NUM_REORDER "")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(NUM_ENGINES ${NUM_ENGINES_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(NUM_ENGINES ${NUM_ENGINES_S10})
+ set(NUM_REORDER "-Xsnum-reorder=6")
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+# Specify -MMD -fsycl-link-targets=... instead of -fintelfpga to work around a known issue; this lowers report quality
+set(HARDWARE_COMPILE_FLAGS -MMD -fsycl-link-targets=spir64_fpga-unknown-unknown-sycldevice -c -DNUM_ENGINES=${NUM_ENGINES})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsparallel=2 -Xsseed=1 ${NUM_REORDER} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DNUM_ENGINES=${NUM_ENGINES})
+set(FINAL_LINK_FLAGS -fintelfpga -DNUM_ENGINES=${NUM_ENGINES})
+
+set(EMULATOR_COMPILE_FLAGS "-v -v -v -g0 -fintelfpga -DFPGA_EMULATOR -DNUM_ENGINES=${NUM_ENGINES}")
+set(EMULATOR_LINK_FLAGS -fintelfpga)
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "gzipkernel_fpga.o")
+ set(DEVICE_IMAGE_FPGA_OBJ "gzipkernel_fpga.a")
+ set(HOST_SOURCE_FILES_WITH_PATH ${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp)
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+ set(OBJ_FILES)
+ foreach(HOST_FILE ${HOST_SOURCE_FILES_WITH_PATH})
+ set(HOST_FPGA_OBJ ${HOST_FILE}.o)
+ add_custom_command(OUTPUT ${HOST_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${HOST_FILE} -o ${HOST_FPGA_OBJ}
+ DEPENDS ${HOST_FILE})
+ list(APPEND OBJ_FILES ${HOST_FPGA_OBJ})
+ endforeach()
+
+ add_custom_command(OUTPUT ${DEVICE_IMAGE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link=image ${DEVICE_FPGA_OBJ} -o ${DEVICE_IMAGE_FPGA_OBJ}
+ DEPENDS ${DEVICE_FPGA_OBJ} ${OBJ_FILES})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${FINAL_LINK_FLAGS} ${OBJ_FILES} ${DEVICE_IMAGE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_IMAGE_FPGA_OBJ} ${OBJ_FILES})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${DEVICE_SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/kernels.hpp kernels.hpp COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_HEADER_FILE} ${DEVICE_HEADER_FILE} COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE} kernels.hpp)
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu Makefile -o=test.gz
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp
new file mode 100755
index 0000000000..b803dee96b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp
@@ -0,0 +1,85 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include "CompareGzip.hpp"
+
+// returns 0 on success, otherwise failure
+int CompareGzipFiles(
+ const std::string
+ &original_file, // original input file to compare gzip uncompressed
+ const std::string &input_gzfile) // gzip file to check
+{
+#ifdef _MSC_VER
+ std::cout
+ << "Info: skipping output verification on Windows, no builtin gunzip\n";
+ return 0;
+#else
+ //------------------------------------------------------------------
+ // assume all good to start with.
+
+ int gzipstatus = 0;
+
+ //------------------------------------------------------------------
+ // Create temporary output filename for gunzip
+
+ char tmp_name[] = "/tmp/gzip_fpga.XXXXXX";
+ mkstemp(tmp_name);
+ std::string outputfile = tmp_name;
+
+ //------------------------------------------------------------------
+ // Check that the original file and gzipped file exist.
+
+ //------------------------------------------------------------------
+ // gunzip the file produced to stdout, capturing to the temp file.
+
+ std::string cmd = "gunzip -c ";
+ cmd += input_gzfile;
+ cmd += " > " + outputfile;
+
+ int gzout = ::system(cmd.c_str());
+ if (gzout != 0) {
+ gzipstatus = 3;
+ }
+
+ //------------------------------------------------------------------
+ // diff the temp file and the original.
+
+ cmd = "diff -q " + outputfile + " " + original_file;
+ int diffout = ::system(cmd.c_str());
+ if (diffout != 0) {
+ gzipstatus = 4;
+ }
+
+ //------------------------------------------------------------------
+ // Cleanup, remove the temp file.
+
+ (void)::remove(outputfile.c_str());
+
+ return gzipstatus;
+#endif
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp
new file mode 100755
index 0000000000..5624b97cea
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp
@@ -0,0 +1,41 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __COMPAREGZIP_H__
+#define __COMPAREGZIP_H__
+#pragma once
+
+#include <iostream>
+#include <string>
+
+int CompareGzipFiles(
+ const std::string
+ &original_file, // original input file to compare gzip uncompressed
+ const std::string &input_gzfile); // gzip file to check
+
+#endif //__COMPAREGZIP_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
new file mode 100755
index 0000000000..71c370aa96
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
@@ -0,0 +1,163 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#define _CRT_SECURE_NO_WARNINGS
+#include "WriteGzip.hpp"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+#include <iostream>
+#include <string>
+
+constexpr int kDeflated = 8;
+#define GZIP_MAGIC "\037\213" // Magic header for gzip files, 1F 8B
+
+#define ORIG_NAME 0x08
+#define OS_CODE 0x03 // Unix OS_CODE
+
+typedef struct GzipHeader {
+ unsigned char magic[2]; // 0x1f, 0x8b
+ unsigned char compress_method; // 0-7 reserved, 8=deflate -- kDeflated
+ unsigned char flags; // b0: file probably ascii
+ // b1: header crc-16 present
+ // b2: extra field present
+ // b3: original file name present
+ // b4: file comment present
+ // b5,6,7: reserved
+ unsigned long time; // file modification time in Unix format.
+ // Set this to 0 for now.
+
+ unsigned char extra; // depends on compression method
+ unsigned char os; // operating system on which compression took place
+
+ // ...
+  // ? bytes ... compressed data ...
+
+ unsigned long crc;
+ unsigned long uncompressed_sz;
+
+} gzip_header, *pgzip_header;
+
+inline static void PutUlong(uint8_t *pc, unsigned long l) {
+ pc[0] = l & 0xff;
+ pc[1] = (l >> 8) & 0xff;
+ pc[2] = (l >> 16) & 0xff;
+ pc[3] = (l >> 24) & 0xff;
+}
+
+// returns 0 on success, otherwise failure
+int WriteBlockGzip(
+ std::string &original_filename, // Original file name being compressed
+ std::string &out_filename, // gzip filename
+ char *obuf, // pointer to compressed data block
+ size_t blen, // length of compressed data block
+ size_t ilen, // original block length
+ uint32_t buffer_crc) // the block's crc
+{
+  //------------------------------------------------------------------
+  // Set up the gzip output file header.
+  // The maximum filename size is arbitrarily set to 256 bytes.
+  // Method is always DEFLATE.
+  // The original filename is always stored in the header.
+  // Timestamp is set to 0 (ignored by gunzip).
+  // Deflate flags are set to 0.
+  // OS code is set to 3 (Unix) via OS_CODE.
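+  //
+  // The resulting 10-byte fixed header is, byte for byte:
+  //   1f 8b 08 08 00 00 00 00 00 03
+  //   (magic, method=DEFLATE, flags=FNAME, mtime=0, extra flags=0, OS=Unix)
+  // followed by the NUL-terminated original file name.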
+
+ int max_filename_sz = 256;
+
+ unsigned char *pgziphdr =
+ (unsigned char *)malloc(sizeof(gzip_header) + max_filename_sz);
+
+ if (!pgziphdr) {
+ std::cout << "pgzip header cannot be allocated\n";
+ return 1;
+ }
+
+ pgziphdr[0] = GZIP_MAGIC[0];
+ pgziphdr[1] = GZIP_MAGIC[1];
+ pgziphdr[2] = kDeflated;
+ pgziphdr[3] = ORIG_NAME;
+
+ // Set time in header to 0, this is ignored by gunzip.
+ pgziphdr[4] = 0;
+ pgziphdr[5] = 0;
+ pgziphdr[6] = 0;
+ pgziphdr[7] = 0;
+
+ // Deflate flags
+ pgziphdr[8] = 0;
+
+ // OS code is Linux in this case.
+ pgziphdr[9] = OS_CODE;
+
+ int ondx = 10;
+
+ const char *p = original_filename.c_str();
+ do {
+ pgziphdr[ondx++] = (*p);
+ } while (*p++);
+
+ int header_bytes = ondx;
+
+ unsigned char prolog[8];
+
+ PutUlong(((unsigned char *)prolog), buffer_crc);
+ PutUlong(((unsigned char *)&prolog[4]), ilen);
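+  // These 8 bytes form the gzip trailer required by RFC 1952: the CRC-32 of
+  // the uncompressed data followed by its length (ISIZE), both little-endian.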
+
+ FILE *fo = fopen(out_filename.c_str(), "w+");
+  if (fo == NULL) {
+ std::cout << "Cannot open file for output: " << out_filename << "\n";
+ free(pgziphdr);
+ return 1;
+ }
+
+ fwrite(pgziphdr, 1, header_bytes, fo);
+ fwrite(obuf, 1, blen, fo);
+ fwrite(prolog, 1, 8, fo);
+
+ if (ferror(fo)) {
+ std::cout << "gzip output file write failure.\n";
+ free(pgziphdr);
+ return 1;
+ }
+
+ if (fclose(fo)) {
+ perror("close");
+ free(pgziphdr);
+ return 1;
+ }
+ free(pgziphdr);
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp
new file mode 100755
index 0000000000..66bc28e315
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp
@@ -0,0 +1,45 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __WRITEGZIP_H__
+#define __WRITEGZIP_H__
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+// returns 0 on success, otherwise failure
+int WriteBlockGzip(
+ std::string &original_filename, // Original file name being compressed
+ std::string &out_filename, // gzip filename
+ char *obuf, // pointer to compressed data block
+ size_t blen, // length of compressed data block
+ size_t ilen, // original block length
+ uint32_t buffer_crc); // the block's crc
+
+#endif //__WRITEGZIP_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja
new file mode 100755
index 0000000000..29d50e63a0
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja
@@ -0,0 +1,32 @@
+device_source_file = gzipkernel.cpp
+device_header_file = gzipkernel.hpp
+host_source_file = gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp
+target_name = gzip
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsclock=280MHz -Xsparallel=2 -Xsseed=1
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${device_source_file} ${host_source_file} -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${device_source_file} ${host_source_file} -fsycl-link -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${device_source_file} ${host_source_file} -fsycl-link -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp
new file mode 100755
index 0000000000..8e6c59c734
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp
@@ -0,0 +1,126 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+/*
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "crc32.hpp"
+
+// This table is CRC32s for all single byte values created by using the
+// makecrc.c utility from gzip for compatibility with gzip. makecrc.c can be
+// found in the gzip source code project found at
+// https://git.savannah.gnu.org/git/gzip.git. The polynomial 0xedb88320 is used
+// for gzip, and thus used to create this table.
+//
+// Not copyrighted 1990, Mark Adler.
+//
+const unsigned int crc32_table[] = {
+ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
+ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
+ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
+ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
+ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
+ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
+ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
+ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
+ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
+ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
+ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
+ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
+ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
+ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
+ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
+ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
+ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
+ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
+ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
+ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
+ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
+ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
+ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
+ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
+ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
+ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
+ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
+ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
+ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
+ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
+ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
+ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
+ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
+ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
+ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
+ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
+ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
+ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
+ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
+ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
+ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
+ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
+ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
+ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
+ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
+ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
+ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
+ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
+ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
+ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
+ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
+ 0x2d02ef8dL};
+
+//
+// This routine creates a Crc32 from a memory buffer (address, and length), and
+// a previous crc. This routine can be called iteratively on different portions
+// of the same buffer, using a previously returned crc value. The
+// value 0xffffffff is used for the first buffer invocation.
+unsigned int Crc32Host(
+ const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ unsigned int previous_crc) // previous CRC, allows combining.
+{
+ unsigned int curr_crc = ~previous_crc;
+ if (sz) do {
+ curr_crc =
+ crc32_table[((int)curr_crc ^ (*pbuf++)) & 0xff] ^ (curr_crc >> 8);
+ } while (--sz);
+ return curr_crc ^ 0xffffffffL;
+}
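+
+// Example of chaining (illustrative only): a buffer split into two chunks can
+// be CRC'd incrementally by feeding each result into the next call, e.g.
+//   unsigned int crc = Crc32Host(chunk_a, len_a, seed);  // seed per note above
+//   crc = Crc32Host(chunk_b, len_b, crc);                // continue from chunk_a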
+
+unsigned int Crc32(const char *in, size_t buffer_sz,
+ unsigned int previous_crc) {
+ const int num_nibbles_parallel = 64;
+ const int num_sections =
+ buffer_sz / (num_nibbles_parallel / 2); // how many loop iterations
+ // now deal with the remainder, this should be done on the software host
+ // the post-invert also happens inside crc_reference
+ const char *remaining_data = &in[num_sections * (num_nibbles_parallel / 2)];
+ int remaining_bytes = buffer_sz % (num_nibbles_parallel / 2);
+ return Crc32Host(remaining_data, remaining_bytes, previous_crc);
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
new file mode 100755
index 0000000000..138a8f0754
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
@@ -0,0 +1,46 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __CRC32_H__
+#define __CRC32_H__
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+uint32_t Crc32Host(
+ const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ uint32_t previous_crc); // previous CRC, allows combining. First invocation
+ // would use 0xffffffff.
+uint32_t Crc32(const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ uint32_t previous_crc); // previous CRC, allows combining. First
+ // invocation would use 0xffffffff.
+
+#endif //__CRC32_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp
new file mode 100755
index 0000000000..9ecfe11728
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp
@@ -0,0 +1,520 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "CompareGzip.hpp"
+#include "WriteGzip.hpp"
+#include "crc32.hpp"
+#include "dpc_common.hpp"
+#include "gzipkernel.hpp"
+#include "kernels.hpp"
+
+using namespace sycl;
+
+// The minimum file size of a file to be compressed.
+// Any filesize less than this results in an error.
+constexpr int minimum_filesize = kVec + 1;
+
+bool help = false;
+
+int CompressFile(queue &q, std::string &input_file, std::vector<std::string> outfilenames,
+ int iterations, bool report);
+
+void Help(void) {
+ // Command line arguments.
+ // gzip [options] filetozip [options]
+ // -h,--help : help
+
+ // future options?
+ // -p,performance : output perf metrics
+ // -m,maxmapping=# : maximum mapping size
+
+ std::cout << "gzip filename [options]\n";
+ std::cout << " -h,--help : this help text\n";
+  std::cout
+      << "  -o=<filename>,--output-file=<filename> : specify output file\n";
+}
+
+bool FindGetArg(std::string &arg, const char *str, int defaultval, int *val) {
+ std::size_t found = arg.find(str, 0, strlen(str));
+ if (found != std::string::npos) {
+ int value = atoi(&arg.c_str()[strlen(str)]);
+ *val = value;
+ return true;
+ }
+ return false;
+}
+
+constexpr int kMaxStringLen = 40;
+
+bool FindGetArgString(std::string &arg, const char *str, char *str_value,
+ size_t maxchars) {
+ std::size_t found = arg.find(str, 0, strlen(str));
+ if (found != std::string::npos) {
+ const char *sptr = &arg.c_str()[strlen(str)];
+ for (int i = 0; i < maxchars - 1; i++) {
+ char ch = sptr[i];
+ switch (ch) {
+ case ' ':
+ case '\t':
+ case '\0':
+ str_value[i] = 0;
+ return true;
+ break;
+ default:
+ str_value[i] = ch;
+ break;
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+size_t SyclGetExecTimeNs(event e) {
+ size_t start_time =
+      e.get_profiling_info<info::event_profiling::command_start>();
+ size_t end_time =
+      e.get_profiling_info<info::event_profiling::command_end>();
+ return (end_time - start_time);
+}
+
+int main(int argc, char *argv[]) {
+ std::string infilename = "";
+
+  std::vector<std::string> outfilenames(kNumEngines);
+
+ char str_buffer[kMaxStringLen] = {0};
+
+ // Check the number of arguments specified
+ if (argc != 3) {
+    std::cerr << "Incorrect number of arguments. Correct usage: " << argv[0]
+              << " <input_file> -o=<output_file>\n";
+ return 1;
+ }
+
+ for (int i = 1; i < argc; i++) {
+ if (argv[i][0] == '-') {
+ std::string sarg(argv[i]);
+ if (std::string(argv[i]) == "-h") {
+ help = true;
+ }
+ if (std::string(argv[i]) == "--help") {
+ help = true;
+ }
+
+ FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen);
+ FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen);
+ } else {
+ infilename = std::string(argv[i]);
+ }
+ }
+
+ if (help) {
+ Help();
+ return 1;
+ }
+
+ try {
+#ifdef FPGA_EMULATOR
+ intel::fpga_emulator_selector device_selector;
+#else
+ intel::fpga_selector device_selector;
+#endif
+ auto prop_list = property_list{property::queue::enable_profiling()};
+ queue q(device_selector, dpc_common::exception_handler, prop_list);
+
+ std::cout << "Running on device: "
+              << q.get_device().get_info<info::device::name>().c_str() << "\n";
+
+ if (infilename == "") {
+ std::cout << "Must specify a filename to compress\n\n";
+ Help();
+ return 1;
+ }
+
+ // next, check valid and acceptable parameter ranges.
+ // if output filename not set, use the default
+ // name, else use the name specified by the user
+ outfilenames[0] = std::string(infilename) + ".gz";
+ if (strlen(str_buffer)) {
+ outfilenames[0] = std::string(str_buffer);
+ }
+ for (size_t i=1; i< kNumEngines; i++) {
+ // Filenames will be of the form outfilename, outfilename2, outfilename3 etc.
+ outfilenames[i] = outfilenames[0] + std::to_string(i+1);
+ }
+
+ std::cout << "Launching GZIP application with " << kNumEngines
+ << " engines\n";
+
+#ifdef FPGA_EMULATOR
+ CompressFile(q, infilename, outfilenames, 1, true);
+#else
+ // warmup run - use this run to warmup accelerator. There are some steps in
+ // the runtime that are only executed on the first kernel invocation but not
+ // on subsequent invocations. So execute all that stuff here before we
+    // measure performance (in the next call to CompressFile()).
+ CompressFile(q, infilename, outfilenames, 1, false);
+ // profile performance
+ CompressFile(q, infilename, outfilenames, 200, true);
+#endif
+ } catch (sycl::exception const &e) {
+ // Catches exceptions in the host code
+ std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) {
+ std::cout << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cout << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+ return 0;
+}
+
+struct KernelInfo {
+  buffer<struct GzipOutInfo, 1> *gzip_out_buf;
+  buffer<uint32_t, 1> *current_crc;
+  buffer<char, 1> *pobuf;
+  buffer<char, 1> *pibuf;
+ char *pobuf_decompress;
+
+ uint32_t buffer_crc[kMinBufferSize];
+ uint32_t refcrc;
+
+ const char *pref_buffer;
+ char *poutput_buffer;
+ size_t file_size;
+ struct GzipOutInfo out_info[kMinBufferSize];
+ int iteration;
+ bool last_block;
+};
+
+// returns 0 on success, otherwise a non-zero failure code.
+int CompressFile(queue &q, std::string &input_file, std::vector<std::string> outfilenames,
+ int iterations, bool report) {
+ size_t isz;
+ char *pinbuf;
+
+ // Read the input file
+ std::string device_string =
+      q.get_device().get_info<info::device::name>().c_str();
+ bool prepin =
+ (device_string.find("s10") !=
+ std::string::npos); // Check if "s10" is found in the device string. If
+ // the device is S10, we pre-pin some buffers to
+ // improve DMA performance, which is needed to
+ // achieve peak kernel throughput. Pre-pinning is
+ // only supported on the PAC-S10 BSP. It's not
+ // needed on PAC-A10 to achieve peak performance.
+
+ std::ifstream file(input_file,
+ std::ios::in | std::ios::binary | std::ios::ate);
+ if (file.is_open()) {
+ isz = file.tellg();
+    if (prepin) {
+      // Pre-pin the buffer with malloc_host() for faster DMA throughput.
+      pinbuf = (char *)malloc_host(isz, q.get_context());
+    } else {
+      pinbuf = new char[isz];
+    }
+ file.seekg(0, std::ios::beg);
+ file.read(pinbuf, isz);
+ file.close();
+ } else {
+ std::cout << "Error: cannot read specified input file\n";
+ return 1;
+ }
+
+ if (isz < minimum_filesize) {
+ std::cout << "Minimum filesize for compression is " << minimum_filesize
+ << "\n";
+ return 1;
+ }
+
+ int buffers_count = iterations;
+
+ // Create an array of kernel info structures and create buffers for kernel
+ // input/output. The buffers are re-used between iterations, but enough
+ // disjoint buffers are created to support double-buffering.
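+  // Specifically, iterations 0..2 each get their own buffer set, and
+  // iteration i >= 3 re-uses the set from iteration i-3, so up to three
+  // iterations can be in flight at a time.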
+ struct KernelInfo *kinfo[kNumEngines];
+ for (size_t eng = 0; eng < kNumEngines; eng++) {
+ kinfo[eng] =
+ (struct KernelInfo *)malloc(sizeof(struct KernelInfo) * buffers_count);
+ if (kinfo[eng] == NULL) {
+ std::cout << "Cannot allocate kernel info buffer.\n";
+ return 1;
+ }
+ for (int i = 0; i < buffers_count; i++) {
+ kinfo[eng][i].file_size = isz;
+ // Allocating slightly larger buffers (+ 16 * kVec) to account for
+ // granularity of kernel writes
+ int outputSize = kinfo[eng][i].file_size + 16 * kVec < kMinBufferSize
+ ? kMinBufferSize
+ : kinfo[eng][i].file_size + 16 * kVec;
+
+      // Re-use the output buffer from iteration i-3; only the first three
+      // iterations allocate new buffers (pre-pinned with malloc_host() when
+      // supported, to improve DMA bandwidth).
+      if (i >= 3) {
+ kinfo[eng][i].poutput_buffer = kinfo[eng][i - 3].poutput_buffer;
+ } else {
+ if (prepin) {
+ kinfo[eng][i].poutput_buffer =
+ (char *)malloc_host(outputSize, q.get_context());
+ } else {
+ kinfo[eng][i].poutput_buffer = (char *)malloc(outputSize);
+ }
+ if (kinfo[eng][i].poutput_buffer == NULL) {
+ std::cout << "Cannot allocate output buffer.\n";
+          free(kinfo[eng]);
+ return 1;
+ }
+ // zero pages to fully allocate them
+ memset(kinfo[eng][i].poutput_buffer, 0, outputSize);
+ }
+
+ kinfo[eng][i].last_block = true;
+ kinfo[eng][i].iteration = i;
+ kinfo[eng][i].pref_buffer = pinbuf;
+
+      kinfo[eng][i].gzip_out_buf =
+          i >= 3 ? kinfo[eng][i - 3].gzip_out_buf
+                 : new buffer<struct GzipOutInfo, 1>(kMinBufferSize);
+      kinfo[eng][i].current_crc =
+          i >= 3 ? kinfo[eng][i - 3].current_crc
+                 : new buffer<unsigned, 1>(kMinBufferSize);
+      kinfo[eng][i].pibuf =
+          i >= 3 ? kinfo[eng][i - 3].pibuf
+                 : new buffer<char, 1>(kinfo[eng][i].file_size);
+      kinfo[eng][i].pobuf =
+          i >= 3 ? kinfo[eng][i - 3].pobuf : new buffer<char, 1>(outputSize);
+ kinfo[eng][i].pobuf_decompress = (char *)malloc(kinfo[eng][i].file_size);
+ }
+ }
+
+ // Create events for the various parts of the execution so that we can profile
+ // their performance.
+ event e_input_dma [kNumEngines][buffers_count]; // Input to the GZIP engine. This is a transfer from host to device.
+ event e_output_dma [kNumEngines][buffers_count]; // Output from the GZIP engine. This is transfer from device to host.
+ event e_crc_dma [kNumEngines][buffers_count]; // Transfer CRC from device to host
+ event e_size_dma [kNumEngines][buffers_count]; // Transfer compressed file size from device to host
+ event e_k_crc [kNumEngines][buffers_count]; // CRC kernel
+ event e_k_lz [kNumEngines][buffers_count]; // LZ77 kernel
+ event e_k_huff [kNumEngines][buffers_count]; // Huffman Encoding kernel
+
+#ifndef FPGA_EMULATOR
+ dpc_common::TimeInterval perf_timer;
+#endif
+
+
+ /*************************************************/
+ /* Main loop where the actual execution happens */
+ /*************************************************/
+ for (int i = 0; i < buffers_count; i++) {
+ for (size_t eng = 0; eng < kNumEngines; eng++) {
+ // Transfer the input data, to be compressed, from host to device.
+ e_input_dma[eng][i] = q.submit([&](handler &h) {
+        auto in_data =
+            kinfo[eng][i].pibuf->get_access<access::mode::discard_write>(h);
+ h.copy(kinfo[eng][i].pref_buffer, in_data);
+ });
+
+ /************************************/
+ /************************************/
+ /* LAUNCH GZIP ENGINE */
+ /************************************/
+ /************************************/
+ SubmitGzipTasks(q, kinfo[eng][i].file_size, kinfo[eng][i].pibuf,
+ kinfo[eng][i].pobuf, kinfo[eng][i].gzip_out_buf,
+ kinfo[eng][i].current_crc, kinfo[eng][i].last_block,
+ e_k_crc[eng][i], e_k_lz[eng][i], e_k_huff[eng][i], eng);
+
+ // Transfer the output (compressed) data from device to host.
+ e_output_dma[eng][i] = q.submit([&](handler &h) {
+        auto out_data =
+            kinfo[eng][i].pobuf->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].poutput_buffer);
+ });
+
+ // Transfer the file size of the compressed output file from device to host.
+ e_size_dma[eng][i] = q.submit([&](handler &h) {
+        auto out_data =
+            kinfo[eng][i].gzip_out_buf->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].out_info);
+ });
+
+ // Transfer the CRC of the compressed output file from device to host.
+ e_crc_dma[eng][i] = q.submit([&](handler &h) {
+        auto out_data =
+            kinfo[eng][i].current_crc->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].buffer_crc);
+ });
+ }
+ }
+
+ // Wait for all kernels to complete
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ for (int i = 0; i < buffers_count; i++) {
+ e_output_dma[eng][i].wait();
+ e_size_dma[eng][i].wait();
+ e_crc_dma[eng][i].wait();
+ }
+ }
+
+// Stop the timer.
+#ifndef FPGA_EMULATOR
+ double diff_total = perf_timer.Elapsed();
+ double gbps = iterations * isz / (double)diff_total / 1000000000.0;
+#endif
+
+ // Check the compressed file size from each iteration. Make sure the size is actually
+ // less-than-or-equal to the input size. Also calculate the remaining CRC.
+ size_t compressed_sz[kNumEngines];
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ compressed_sz[eng] = 0;
+ for (int i = 0; i < buffers_count; i++) {
+ if (kinfo[eng][i].out_info[0].compression_sz > kinfo[eng][i].file_size) {
+        std::cerr << "Unsupported: compressed file larger than input file ("
+                  << kinfo[eng][i].out_info[0].compression_sz << ")\n";
+ return 1;
+ }
+      // The majority of the CRC is calculated by the CRC kernel on the FPGA,
+      // but the kernel operates on quantized chunks of input data. Any
+      // remaining input data that falls outside those quanta is folded into
+      // the overall CRC by the following host-side function. The last argument
+      // is the running CRC that was computed on the FPGA.
+ kinfo[eng][i].buffer_crc[0] =
+ Crc32(kinfo[eng][i].pref_buffer, kinfo[eng][i].file_size,
+ kinfo[eng][i].buffer_crc[0]);
+ // Accumulate the compressed size across all iterations. Used to
+ // compute compression ratio later.
+ compressed_sz[eng] += kinfo[eng][i].out_info[0].compression_sz;
+ }
+ }
+
+  // Free the input buffer now that all kernels are complete and the time
+  // delta has been captured.
+  if (prepin) {
+    free(pinbuf, q.get_context());
+  } else {
+    delete[] pinbuf;
+  }
+
+ // Write the output compressed data from the first iteration of each engine, to a file.
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ // WriteBlockGzip() returns 1 on failure
+ if (report && WriteBlockGzip(input_file, outfilenames[eng], kinfo[eng][0].poutput_buffer,
+ kinfo[eng][0].out_info[0].compression_sz,
+ kinfo[eng][0].file_size, kinfo[eng][0].buffer_crc[0])) {
+ std::cout << "FAILED\n";
+ return 1;
+ }
+ }
+
+ // Decompress the output from engine-0 and compare against the input file. Only engine-0's
+ // output is verified since all engines are fed the same input data.
+ if (report && CompareGzipFiles(input_file, outfilenames[0])) {
+ std::cout << "FAILED\n";
+ return 1;
+ }
+
+ // Generate throughput report
+ // First gather all the execution times.
+ size_t time_k_crc[kNumEngines];
+ size_t time_k_lz[kNumEngines];
+ size_t time_k_huff[kNumEngines];
+ size_t time_input_dma[kNumEngines];
+ size_t time_output_dma[kNumEngines];
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ time_k_crc[eng] = 0;
+ time_k_lz[eng] = 0;
+ time_k_huff[eng] = 0;
+ time_input_dma[eng] = 0;
+ time_output_dma[eng] = 0;
+ for (int i = 0; i < buffers_count; i++) {
+ e_k_crc[eng][i].wait();
+ e_k_lz[eng][i].wait();
+ e_k_huff[eng][i].wait();
+ time_k_crc[eng] += SyclGetExecTimeNs(e_k_crc[eng][i]);
+ time_k_lz[eng] += SyclGetExecTimeNs(e_k_lz[eng][i]);
+ time_k_huff[eng] += SyclGetExecTimeNs(e_k_huff[eng][i]);
+ time_input_dma[eng] += SyclGetExecTimeNs(e_input_dma[eng][i]);
+ time_output_dma[eng] += SyclGetExecTimeNs(e_output_dma[eng][i]);
+ }
+ }
+
+ if (report) {
+ double compression_ratio =
+ (double)((double)compressed_sz[0] / (double)isz / iterations);
+#ifndef FPGA_EMULATOR
+ std::cout << "Throughput: " << kNumEngines * gbps << " GB/s\n\n";
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ std::cout << "TP breakdown for engine #" << eng << " (GB/s)\n";
+ std::cout << "CRC = " << iterations * isz / (double)time_k_crc[eng]
+ << "\n";
+ std::cout << "LZ77 = " << iterations * isz / (double)time_k_lz[eng]
+ << "\n";
+ std::cout << "Huffman Encoding = "
+ << iterations * isz / (double)time_k_huff[eng] << "\n";
+ std::cout << "DMA host-to-device = "
+ << iterations * isz / (double)time_input_dma[eng] << "\n";
+ std::cout << "DMA device-to-host = "
+ << iterations * isz / (double)time_output_dma[eng] << "\n\n";
+ }
+#endif
+ std::cout << "Compression Ratio " << compression_ratio * 100 << "%\n";
+ }
+
+ // Cleanup anything that was allocated by this routine.
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ for (int i = 0; i < buffers_count; i++) {
+ if (i < 3) {
+ delete kinfo[eng][i].gzip_out_buf;
+ delete kinfo[eng][i].current_crc;
+ delete kinfo[eng][i].pibuf;
+ delete kinfo[eng][i].pobuf;
+ if (prepin) {
+ free(kinfo[eng][i].poutput_buffer, q.get_context());
+ } else {
+ free(kinfo[eng][i].poutput_buffer);
+ }
+ }
+ free(kinfo[eng][i].pobuf_decompress);
+ }
+ free(kinfo[eng]);
+ }
+
+ if (report) std::cout << "PASSED\n";
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp
new file mode 100755
index 0000000000..01d69c1f9b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp
@@ -0,0 +1,2406 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+/*
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <CL/sycl.hpp>
+
+#include "gzipkernel.hpp"
+#include "kernels.hpp"
+
+using namespace sycl;
+
+// This reference design uses a template-based unroller. It's also possible
+// to specify this in a more concise way using a pragma. See the loop unroll
+// tutorial for more information.
+template <int Begin, int End>
+struct Unroller {
+  template <typename Action>
+  static void step(const Action &action) {
+    action(Begin);
+    Unroller<Begin + 1, End>::step(action);
+  }
+};
+
+template <int End>
+struct Unroller<End, End> {
+  template <typename Action>
+  static void step(const Action &action) {}
+};
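+// For illustration only: an equivalent full unroll could also be written with
+// an ordinary loop and an unroll pragma, e.g.
+//
+//   #pragma unroll
+//   for (int i = 0; i < kVec; i++) {
+//     action(i);
+//   }
+//
+// The template recursion above simply makes the unrolling explicit at compile
+// time.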
+
+int GetHuffLiteralBits(unsigned char ch) {
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ return static_ltree[ch].code;
+}
+
+int GetHuffLiteralLen(unsigned char ch) {
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ return static_ltree[ch].len;
+}
+
+int GetHuffRunLen(int len, int initial_dist) {
+ int lc;
+ unsigned code;
+ int extra;
+ int dist;
+ int local_lbits, local_llen;
+ int local_dbits, local_dlen;
+ local_lbits = 0;
+ local_llen = 0;
+
+ int base_length[kLengthCodes] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24,
+ 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0,
+ };
+
+ int extra_lbits[kLengthCodes] // extra bits for each length code
+ = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
+
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+
+ // distance codes. The first 256 values correspond to the distances
+ // 3 .. 258, the last 256 values correspond to the top 8 bits of
+ // the 15 bit distances.
+ unsigned char dist_code[512] = {
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29,
+ };
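+  // Note: d_code() is defined in kernels.hpp; it is assumed here to follow
+  // the standard zlib mapping into this table, roughly
+  //   d_code(dist) == (dist < 256) ? dist_code[dist]
+  //                                : dist_code[256 + (dist >> 7)],
+  // i.e. small distances index the first half directly and larger distances
+  // are indexed by their top bits.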
+ // length code for each normalized match length (0 == kMinMatch)
+ unsigned char length_code[kMaxMatch - kMinMatch + 1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+ 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 28,
+ };
+
+ int extra_dbits[kDCodes] // extra bits for each distance code
+ = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+
+ int base_dist[kDCodes] = {
+ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24,
+ 32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
+ 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
+ };
+
+ CtData static_dtree[kDCodes] = {
+ {0, 5}, {16, 5}, {8, 5}, {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5},
+ {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5},
+ {1, 5}, {17, 5}, {9, 5}, {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5},
+ {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5},
+ };
+
+ lc = len - kMinMatch;
+ code = length_code[lc];
+
+ local_lbits = static_ltree[code + kLiterals + 1].code;
+ local_llen = static_ltree[code + kLiterals + 1].len;
+ extra = extra_lbits[code];
+ if (extra) {
+ lc -= base_length[code];
+ local_lbits |= lc << local_llen;
+ local_llen += extra;
+ }
+
+ dist = initial_dist;
+ dist--;
+ code = d_code(dist);
+ local_dbits = static_dtree[code].code;
+ local_dlen = static_dtree[code].len;
+ extra = extra_dbits[code];
+ if (extra) {
+ dist -= base_dist[code];
+ local_dbits |= dist << local_dlen;
+ local_dlen += extra;
+ }
+
+ local_lbits |= local_dbits << local_llen;
+ local_llen += local_dlen;
+
+ return local_llen;
+}
+
+int GetHuffRunBits(int len, int initial_dist) {
+ int lc;
+ unsigned code;
+ int extra;
+ int dist;
+ int local_lbits, local_llen;
+ int local_dbits, local_dlen;
+ local_lbits = 0;
+ local_llen = 0;
+
+ int base_length[kLengthCodes] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24,
+ 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0,
+ };
+
+ int extra_lbits[kLengthCodes] // extra bits for each length code
+ = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
+
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+
+ // distance codes. The first 256 values correspond to the distances
+ // 3 .. 258, the last 256 values correspond to the top 8 bits of
+ // the 15 bit distances.
+ unsigned char dist_code[512] = {
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29,
+ };
+ // length code for each normalized match length (0 == kMinMatch)
+ unsigned char length_code[kMaxMatch - kMinMatch + 1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+ 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 28,
+ };
+
+ int extra_dbits[kDCodes] // extra bits for each distance code
+ = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+
+ int base_dist[kDCodes] = {
+ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24,
+ 32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
+ 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
+ };
+
+ CtData static_dtree[kDCodes] = {
+ {0, 5}, {16, 5}, {8, 5}, {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5},
+ {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5},
+ {1, 5}, {17, 5}, {9, 5}, {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5},
+ {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5},
+ };
+
+ lc = len - kMinMatch;
+ code = length_code[lc];
+
+ local_lbits = static_ltree[code + kLiterals + 1].code;
+ local_llen = static_ltree[code + kLiterals + 1].len;
+ extra = extra_lbits[code];
+ if (extra) {
+ lc -= base_length[code];
+ local_lbits |= lc << local_llen;
+ local_llen += extra;
+ }
+
+ dist = initial_dist;
+ dist--;
+ code = d_code(dist);
+ local_dbits = static_dtree[code].code;
+ local_dlen = static_dtree[code].len;
+ extra = extra_dbits[code];
+ if (extra) {
+ dist -= base_dist[code];
+ local_dbits |= dist << local_dlen;
+ local_dlen += extra;
+ }
+
+ local_lbits |= local_dbits << local_llen;
+ local_llen += local_dlen;
+
+ return local_lbits;
+}
+
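+// Convention for the "len" argument used by GetHuffLen(), IsValid() and
+// GetHuffBits() (inferred from the switch statements below): -3 selects the
+// end-of-block code, -2 emits the raw 3-bit value passed in "ch", -1 marks an
+// invalid slot that contributes no bits, 0 emits the literal "ch", and any
+// positive value emits a length/distance pair for an LZ77 match.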
+int GetHuffLen(int len, int dist, unsigned char ch) {
+ int returned_len;
+
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ switch (len) {
+ case -3:
+ returned_len = static_ltree[kEndBlock].len;
+ break;
+ case -2:
+ returned_len = 3;
+ break;
+ case -1:
+ returned_len = 0;
+ break;
+ case 0:
+ returned_len = GetHuffLiteralLen(ch);
+ break;
+ default:
+ returned_len = GetHuffRunLen(len, dist);
+ break;
+ }
+ return returned_len;
+}
+
+int IsValid(int len, int dist, unsigned char ch) {
+ switch (len) {
+ case -3:
+ return 1;
+ case -2:
+ return 1;
+ case -1:
+ return 0;
+ case 0:
+ return 1;
+ default:
+ return 1;
+ }
+}
+
+int GetHuffBits(int len, int dist, unsigned char ch) {
+ int bits;
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ switch (len) {
+ case -3:
+ bits = static_ltree[kEndBlock].code;
+ break;
+ case -2:
+ bits = ch;
+ break;
+ case -1:
+ bits = 0;
+ break;
+ case 0:
+ bits = GetHuffLiteralBits(ch);
+ break;
+ default:
+ bits = GetHuffRunBits(len, dist);
+ break;
+ }
+ return bits;
+}
+
+// Assembles up to kVecX2 unsigned char values based on the given Huffman
+// encoding and writes up to kMaxHuffcodeBits * kVecX2 bits to memory.
+bool HufEnc(char *len, short *dist, unsigned char *data, unsigned int *outdata,
+ unsigned int *leftover, unsigned short *leftover_size) {
+ // array that contains the bit position of each symbol
+ unsigned short bitpos[kVec + 1];
+ bitpos[0] = 0;
+
+ Unroller<0, kVec>::step([&](int i) {
+ bitpos[i + 1] = bitpos[i] + (IsValid(len[i], dist[i], data[i])
+ ? GetHuffLen(len[i], dist[i], data[i])
+ : 0);
+ });
+
+  // leftover is an array that carries Huffman-encoded data not yet written to
+  // memory. Adjust leftover_size by the number of bits collected this cycle.
+  unsigned short prev_cycle_offset = *leftover_size;
+  *leftover_size += (bitpos[kVec] & 0x3fff);
+
+ // we'll write this cycle if we have collected enough data (kVec shorts or
+ // more)
+ bool write = *leftover_size & (kVec * (kMaxHuffcodeBits * 2));
+
+ // subtract kVec shorts from leftover size (if it's bigger
+ // than kVec) because we'll write those out this cycle
+ *leftover_size &= ~(kVec * (kMaxHuffcodeBits * 2));
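+  // Note: this bookkeeping assumes kVec * (kMaxHuffcodeBits * 2) is a single
+  // power-of-two bit position: the AND above tests whether at least that many
+  // bits have accumulated, and the mask clear above removes them from the
+  // count once they are scheduled to be written out this cycle.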
+
+ // Adjust bitpos based on leftover offset from previous cycle
+ Unroller<0, kVec>::step(
+ [&](int i) { bitpos[i] += (prev_cycle_offset & 0x3fff); });
+
+  // Huffman codes can have arbitrary bit alignment, so a code can spill
+  // across two words of the output array. Use the two-component Uint2Gzip
+  // struct to keep the two parts of each code separate: iterate over all
+  // codes and construct a Uint2Gzip value containing the code properly
+  // aligned.
+ struct Uint2Gzip code[kVec];
+ Unroller<0, kVec>::step([&](int i) {
+ code[i].x = 0;
+ code[i].y = 0;
+ });
+
+ Unroller<0, kVec>::step([&](int i) {
+ // Codes can be more than 16 bits, so use uint32
+ unsigned int curr_code = GetHuffBits(len[i], dist[i], data[i]);
+ unsigned char bitpos_in_short = bitpos[i] & 0x01F;
+
+ unsigned long long temp = (unsigned long long)curr_code << bitpos_in_short;
+ unsigned int temp1 = (unsigned int)temp;
+ unsigned int temp2 = temp >> 32ULL;
+
+ if (IsValid(len[i], dist[i], data[i])) {
+ code[i].x = temp1;
+ code[i].y = temp2;
+ } else {
+ code[i].x = temp1;
+ code[i].y = temp2;
+ }
+ });
+
+ // Iterate over all destination locations and gather the required data
+ unsigned int new_leftover[kVec];
+ Unroller<0, kVec>::step([&](int i) {
+ new_leftover[i] = 0;
+ outdata[i] = 0;
+
+ Unroller<0, kVec>::step([&](int j) {
+ // figure out whether code[j] goes into bucket[i]
+ bool match_first = ((bitpos[j] >> 5) & (kVec - 1)) == i;
+ bool match_second =
+ ((bitpos[j] >> 5) & (kVec - 1)) == ((i - 1) & (kVec - 1));
+
+ // if code[j] maps onto current bucket then OR its code, else OR with 0
+ unsigned int component =
+ match_first ? code[j].x : (match_second ? code[j].y : 0);
+
+ // overflow from kVec shorts, need to move onto new_leftover
+ bool use_later =
+ (bitpos[j] & (kVec * (kMaxHuffcodeBits * 2))) ||
+ (match_second && (((bitpos[j] >> 5) & (kVec - 1)) == kVec - 1));
+
+ // write to output
+ outdata[i] |= use_later ? 0 : component;
+ new_leftover[i] |= use_later ? component : 0;
+ });
+ });
+
+ // Apply previous leftover on the outdata
+ // Also, if didn't write, apply prev leftover onto newleftover
+ Unroller<0, kVec>::step([&](int i) {
+ outdata[i] |= leftover[i];
+ leftover[i] = outdata[i];
+ });
+
+ // Split unroll into two unrolls to avoid compiler crash. This is a temporary
+ // workaround while awaiting a compiler feature.
+ if (write) {
+ Unroller<0, kVec>::step([&](int i) { leftover[i] = new_leftover[i]; });
+ }
+
+ return write;
+}
+
+// Forward declarations of the kernel name classes, templated on the engine
+// index so that each engine instantiates uniquely named kernels.
+template <int engineID>
+class CRC;
+template <int engineID>
+class LZReduction;
+template <int engineID>
+class StaticHuffman;
+
+template <int engineID>
+void SubmitGzipTasksSingleEngine(
+    queue &q,
+    size_t block_size,  // size of block to compress.
+    buffer<char, 1> *pibuf, buffer<char, 1> *pobuf,
+    buffer<struct GzipOutInfo, 1> *gzip_out_buf,
+    buffer<unsigned, 1> *result_crc, bool last_block, event &e_crc,
+    event &e_lz, event &e_huff) {
+ using acc_dist_channel = intel::pipe;
+ using acc_dist_channel_last = intel::pipe;
+
+  e_crc = q.submit([&](handler &h) {
+    auto accessor_isz = block_size;
+    auto acc_pibuf = pibuf->get_access<access::mode::read>(h);
+    auto accresult_crc = result_crc->get_access<access::mode::discard_write>(h);
+    h.single_task<CRC<engineID>>([=]() [[intel::kernel_args_restrict]] {
+ const unsigned int table64[64][16] = {
+ {
+ 0x0,
+ 0xf1da05aa,
+ 0x38c50d15,
+ 0xc91f08bf,
+ 0x718a1a2a,
+ 0x80501f80,
+ 0x494f173f,
+ 0xb8951295,
+ 0xe3143454,
+ 0x12ce31fe,
+ 0xdbd13941,
+ 0x2a0b3ceb,
+ 0x929e2e7e,
+ 0x63442bd4,
+ 0xaa5b236b,
+ 0x5b8126c1,
+ },
+
+ {
+ 0x0,
+ 0x1d596ee9,
+ 0x3ab2ddd2,
+ 0x27ebb33b,
+ 0x7565bba4,
+ 0x683cd54d,
+ 0x4fd76676,
+ 0x528e089f,
+ 0xeacb7748,
+ 0xf79219a1,
+ 0xd079aa9a,
+ 0xcd20c473,
+ 0x9faeccec,
+ 0x82f7a205,
+ 0xa51c113e,
+ 0xb8457fd7,
+ },
+
+ {
+ 0x0,
+ 0xee7e8d1,
+ 0x1dcfd1a2,
+ 0x13283973,
+ 0x3b9fa344,
+ 0x35784b95,
+ 0x265072e6,
+ 0x28b79a37,
+ 0x773f4688,
+ 0x79d8ae59,
+ 0x6af0972a,
+ 0x64177ffb,
+ 0x4ca0e5cc,
+ 0x42470d1d,
+ 0x516f346e,
+ 0x5f88dcbf,
+ },
+
+ {
+ 0x0,
+ 0xee7e8d10,
+ 0x78c1c61,
+ 0xe9f29171,
+ 0xf1838c2,
+ 0xe166b5d2,
+ 0x89424a3,
+ 0xe6eaa9b3,
+ 0x1e307184,
+ 0xf04efc94,
+ 0x19bc6de5,
+ 0xf7c2e0f5,
+ 0x11284946,
+ 0xff56c456,
+ 0x16a45527,
+ 0xf8dad837,
+ },
+
+ {
+ 0x0,
+ 0x3c60e308,
+ 0x78c1c610,
+ 0x44a12518,
+ 0xf1838c20,
+ 0xcde36f28,
+ 0x89424a30,
+ 0xb522a938,
+ 0x38761e01,
+ 0x416fd09,
+ 0x40b7d811,
+ 0x7cd73b19,
+ 0xc9f59221,
+ 0xf5957129,
+ 0xb1345431,
+ 0x8d54b739,
+ },
+
+ {
+ 0x0,
+ 0x70ec3c02,
+ 0xe1d87804,
+ 0x91344406,
+ 0x18c1f649,
+ 0x682dca4b,
+ 0xf9198e4d,
+ 0x89f5b24f,
+ 0x3183ec92,
+ 0x416fd090,
+ 0xd05b9496,
+ 0xa0b7a894,
+ 0x29421adb,
+ 0x59ae26d9,
+ 0xc89a62df,
+ 0xb8765edd,
+ },
+
+ {
+ 0x0,
+ 0x6307d924,
+ 0xc60fb248,
+ 0xa5086b6c,
+ 0x576e62d1,
+ 0x3469bbf5,
+ 0x9161d099,
+ 0xf26609bd,
+ 0xaedcc5a2,
+ 0xcddb1c86,
+ 0x68d377ea,
+ 0xbd4aece,
+ 0xf9b2a773,
+ 0x9ab57e57,
+ 0x3fbd153b,
+ 0x5cbacc1f,
+ },
+
+ {
+ 0x0,
+ 0x86c88d05,
+ 0xd6e01c4b,
+ 0x5028914e,
+ 0x76b13ed7,
+ 0xf079b3d2,
+ 0xa051229c,
+ 0x2699af99,
+ 0xed627dae,
+ 0x6baaf0ab,
+ 0x3b8261e5,
+ 0xbd4aece0,
+ 0x9bd34379,
+ 0x1d1bce7c,
+ 0x4d335f32,
+ 0xcbfbd237,
+ },
+
+ {
+ 0x0,
+ 0x1b5fd1d,
+ 0x36bfa3a,
+ 0x2de0727,
+ 0x6d7f474,
+ 0x7620969,
+ 0x5bc0e4e,
+ 0x409f353,
+ 0xdafe8e8,
+ 0xc1a15f5,
+ 0xec412d2,
+ 0xf71efcf,
+ 0xb781c9c,
+ 0xacde181,
+ 0x813e6a6,
+ 0x9a61bbb,
+ },
+
+ {
+ 0x0,
+ 0x1b5fd1d0,
+ 0x36bfa3a0,
+ 0x2de07270,
+ 0x6d7f4740,
+ 0x76209690,
+ 0x5bc0e4e0,
+ 0x409f3530,
+ 0xdafe8e80,
+ 0xc1a15f50,
+ 0xec412d20,
+ 0xf71efcf0,
+ 0xb781c9c0,
+ 0xacde1810,
+ 0x813e6a60,
+ 0x9a61bbb0,
+ },
+
+ {
+ 0x0,
+ 0x6e8c1b41,
+ 0xdd183682,
+ 0xb3942dc3,
+ 0x61416b45,
+ 0xfcd7004,
+ 0xbc595dc7,
+ 0xd2d54686,
+ 0xc282d68a,
+ 0xac0ecdcb,
+ 0x1f9ae008,
+ 0x7116fb49,
+ 0xa3c3bdcf,
+ 0xcd4fa68e,
+ 0x7edb8b4d,
+ 0x1057900c,
+ },
+
+ {
+ 0x0,
+ 0x5e74ab55,
+ 0xbce956aa,
+ 0xe29dfdff,
+ 0xa2a3ab15,
+ 0xfcd70040,
+ 0x1e4afdbf,
+ 0x403e56ea,
+ 0x9e36506b,
+ 0xc042fb3e,
+ 0x22df06c1,
+ 0x7cabad94,
+ 0x3c95fb7e,
+ 0x62e1502b,
+ 0x807cadd4,
+ 0xde080681,
+ },
+
+ {
+ 0x0,
+ 0xe71da697,
+ 0x154a4b6f,
+ 0xf257edf8,
+ 0x2a9496de,
+ 0xcd893049,
+ 0x3fdeddb1,
+ 0xd8c37b26,
+ 0x55292dbc,
+ 0xb2348b2b,
+ 0x406366d3,
+ 0xa77ec044,
+ 0x7fbdbb62,
+ 0x98a01df5,
+ 0x6af7f00d,
+ 0x8dea569a,
+ },
+
+ {
+ 0x0,
+ 0xaa525b78,
+ 0x8fd5b0b1,
+ 0x2587ebc9,
+ 0xc4da6723,
+ 0x6e883c5b,
+ 0x4b0fd792,
+ 0xe15d8cea,
+ 0x52c5c807,
+ 0xf897937f,
+ 0xdd1078b6,
+ 0x774223ce,
+ 0x961faf24,
+ 0x3c4df45c,
+ 0x19ca1f95,
+ 0xb39844ed,
+ },
+
+ {
+ 0x0,
+ 0xa58b900e,
+ 0x9066265d,
+ 0x35edb653,
+ 0xfbbd4afb,
+ 0x5e36daf5,
+ 0x6bdb6ca6,
+ 0xce50fca8,
+ 0x2c0b93b7,
+ 0x898003b9,
+ 0xbc6db5ea,
+ 0x19e625e4,
+ 0xd7b6d94c,
+ 0x723d4942,
+ 0x47d0ff11,
+ 0xe25b6f1f,
+ },
+
+ {
+ 0x0,
+ 0x5817276e,
+ 0xb02e4edc,
+ 0xe83969b2,
+ 0xbb2d9bf9,
+ 0xe33abc97,
+ 0xb03d525,
+ 0x5314f24b,
+ 0xad2a31b3,
+ 0xf53d16dd,
+ 0x1d047f6f,
+ 0x45135801,
+ 0x1607aa4a,
+ 0x4e108d24,
+ 0xa629e496,
+ 0xfe3ec3f8,
+ },
+
+ {
+ 0x0,
+ 0x81256527,
+ 0xd93bcc0f,
+ 0x581ea928,
+ 0x69069e5f,
+ 0xe823fb78,
+ 0xb03d5250,
+ 0x31183777,
+ 0xd20d3cbe,
+ 0x53285999,
+ 0xb36f0b1,
+ 0x8a139596,
+ 0xbb0ba2e1,
+ 0x3a2ec7c6,
+ 0x62306eee,
+ 0xe3150bc9,
+ },
+
+ {
+ 0x0,
+ 0x7f6b7f3d,
+ 0xfed6fe7a,
+ 0x81bd8147,
+ 0x26dcfab5,
+ 0x59b78588,
+ 0xd80a04cf,
+ 0xa7617bf2,
+ 0x4db9f56a,
+ 0x32d28a57,
+ 0xb36f0b10,
+ 0xcc04742d,
+ 0x6b650fdf,
+ 0x140e70e2,
+ 0x95b3f1a5,
+ 0xead88e98,
+ },
+
+ {
+ 0x0,
+ 0x9b73ead4,
+ 0xed96d3e9,
+ 0x76e5393d,
+ 0x5ca193,
+ 0x9b2f4b47,
+ 0xedca727a,
+ 0x76b998ae,
+ 0xb94326,
+ 0x9bcaa9f2,
+ 0xed2f90cf,
+ 0x765c7a1b,
+ 0xe5e2b5,
+ 0x9b960861,
+ 0xed73315c,
+ 0x7600db88,
+ },
+
+ {
+ 0x0,
+ 0x172864c,
+ 0x2e50c98,
+ 0x3978ad4,
+ 0x5ca1930,
+ 0x4b89f7c,
+ 0x72f15a8,
+ 0x65d93e4,
+ 0xb943260,
+ 0xae6b42c,
+ 0x9713ef8,
+ 0x803b8b4,
+ 0xe5e2b50,
+ 0xf2cad1c,
+ 0xcbb27c8,
+ 0xdc9a184,
+ },
+
+ {
+ 0x0,
+ 0x172864c0,
+ 0x2e50c980,
+ 0x3978ad40,
+ 0x5ca19300,
+ 0x4b89f7c0,
+ 0x72f15a80,
+ 0x65d93e40,
+ 0xb9432600,
+ 0xae6b42c0,
+ 0x9713ef80,
+ 0x803b8b40,
+ 0xe5e2b500,
+ 0xf2cad1c0,
+ 0xcbb27c80,
+ 0xdc9a1840,
+ },
+
+ {
+ 0x0,
+ 0xa9f74a41,
+ 0x889f92c3,
+ 0x2168d882,
+ 0xca4e23c7,
+ 0x63b96986,
+ 0x42d1b104,
+ 0xeb26fb45,
+ 0x4fed41cf,
+ 0xe61a0b8e,
+ 0xc772d30c,
+ 0x6e85994d,
+ 0x85a36208,
+ 0x2c542849,
+ 0xd3cf0cb,
+ 0xa4cbba8a,
+ },
+
+ {
+ 0x0,
+ 0x9fda839e,
+ 0xe4c4017d,
+ 0x7b1e82e3,
+ 0x12f904bb,
+ 0x8d238725,
+ 0xf63d05c6,
+ 0x69e78658,
+ 0x25f20976,
+ 0xba288ae8,
+ 0xc136080b,
+ 0x5eec8b95,
+ 0x370b0dcd,
+ 0xa8d18e53,
+ 0xd3cf0cb0,
+ 0x4c158f2e,
+ },
+
+ {
+ 0x0,
+ 0x4be412ec,
+ 0x97c825d8,
+ 0xdc2c3734,
+ 0xf4e14df1,
+ 0xbf055f1d,
+ 0x63296829,
+ 0x28cd7ac5,
+ 0x32b39da3,
+ 0x79578f4f,
+ 0xa57bb87b,
+ 0xee9faa97,
+ 0xc652d052,
+ 0x8db6c2be,
+ 0x519af58a,
+ 0x1a7ee766,
+ },
+
+ {
+ 0x0,
+ 0x65673b46,
+ 0xcace768c,
+ 0xafa94dca,
+ 0x4eedeb59,
+ 0x2b8ad01f,
+ 0x84239dd5,
+ 0xe144a693,
+ 0x9ddbd6b2,
+ 0xf8bcedf4,
+ 0x5715a03e,
+ 0x32729b78,
+ 0xd3363deb,
+ 0xb65106ad,
+ 0x19f84b67,
+ 0x7c9f7021,
+ },
+
+ {
+ 0x0,
+ 0xe0c6ab25,
+ 0x1afc500b,
+ 0xfa3afb2e,
+ 0x35f8a016,
+ 0xd53e0b33,
+ 0x2f04f01d,
+ 0xcfc25b38,
+ 0x6bf1402c,
+ 0x8b37eb09,
+ 0x710d1027,
+ 0x91cbbb02,
+ 0x5e09e03a,
+ 0xbecf4b1f,
+ 0x44f5b031,
+ 0xa4331b14,
+ },
+
+ {
+ 0x0,
+ 0xd7e28058,
+ 0x74b406f1,
+ 0xa35686a9,
+ 0xe9680de2,
+ 0x3e8a8dba,
+ 0x9ddc0b13,
+ 0x4a3e8b4b,
+ 0x9a11d85,
+ 0xde439ddd,
+ 0x7d151b74,
+ 0xaaf79b2c,
+ 0xe0c91067,
+ 0x372b903f,
+ 0x947d1696,
+ 0x439f96ce,
+ },
+
+ {
+ 0x0,
+ 0x13423b0a,
+ 0x26847614,
+ 0x35c64d1e,
+ 0x4d08ec28,
+ 0x5e4ad722,
+ 0x6b8c9a3c,
+ 0x78cea136,
+ 0x9a11d850,
+ 0x8953e35a,
+ 0xbc95ae44,
+ 0xafd7954e,
+ 0xd7193478,
+ 0xc45b0f72,
+ 0xf19d426c,
+ 0xe2df7966,
+ },
+
+ {
+ 0x0,
+ 0xef52b6e1,
+ 0x5d46b83,
+ 0xea86dd62,
+ 0xba8d706,
+ 0xe4fa61e7,
+ 0xe7cbc85,
+ 0xe12e0a64,
+ 0x1751ae0c,
+ 0xf80318ed,
+ 0x1285c58f,
+ 0xfdd7736e,
+ 0x1cf9790a,
+ 0xf3abcfeb,
+ 0x192d1289,
+ 0xf67fa468,
+ },
+
+ {
+ 0x0,
+ 0x2ea35c18,
+ 0x5d46b830,
+ 0x73e5e428,
+ 0xba8d7060,
+ 0x942e2c78,
+ 0xe7cbc850,
+ 0xc9689448,
+ 0xae6be681,
+ 0x80c8ba99,
+ 0xf32d5eb1,
+ 0xdd8e02a9,
+ 0x14e696e1,
+ 0x3a45caf9,
+ 0x49a02ed1,
+ 0x670372c9,
+ },
+
+ {
+ 0x0,
+ 0x87a6cb43,
+ 0xd43c90c7,
+ 0x539a5b84,
+ 0x730827cf,
+ 0xf4aeec8c,
+ 0xa734b708,
+ 0x20927c4b,
+ 0xe6104f9e,
+ 0x61b684dd,
+ 0x322cdf59,
+ 0xb58a141a,
+ 0x95186851,
+ 0x12bea312,
+ 0x4124f896,
+ 0xc68233d5,
+ },
+
+ {
+ 0x0,
+ 0x1751997d,
+ 0x2ea332fa,
+ 0x39f2ab87,
+ 0x5d4665f4,
+ 0x4a17fc89,
+ 0x73e5570e,
+ 0x64b4ce73,
+ 0xba8ccbe8,
+ 0xaddd5295,
+ 0x942ff912,
+ 0x837e606f,
+ 0xe7caae1c,
+ 0xf09b3761,
+ 0xc9699ce6,
+ 0xde38059b,
+ },
+
+ {
+ 0x0,
+ 0xae689191,
+ 0x87a02563,
+ 0x29c8b4f2,
+ 0xd4314c87,
+ 0x7a59dd16,
+ 0x539169e4,
+ 0xfdf9f875,
+ 0x73139f4f,
+ 0xdd7b0ede,
+ 0xf4b3ba2c,
+ 0x5adb2bbd,
+ 0xa722d3c8,
+ 0x94a4259,
+ 0x2082f6ab,
+ 0x8eea673a,
+ },
+
+ {
+ 0x0,
+ 0xe6273e9e,
+ 0x173f7b7d,
+ 0xf11845e3,
+ 0x2e7ef6fa,
+ 0xc859c864,
+ 0x39418d87,
+ 0xdf66b319,
+ 0x5cfdedf4,
+ 0xbadad36a,
+ 0x4bc29689,
+ 0xade5a817,
+ 0x72831b0e,
+ 0x94a42590,
+ 0x65bc6073,
+ 0x839b5eed,
+ },
+
+ {
+ 0x0,
+ 0xb9fbdbe8,
+ 0xa886b191,
+ 0x117d6a79,
+ 0x8a7c6563,
+ 0x3387be8b,
+ 0x22fad4f2,
+ 0x9b010f1a,
+ 0xcf89cc87,
+ 0x7672176f,
+ 0x670f7d16,
+ 0xdef4a6fe,
+ 0x45f5a9e4,
+ 0xfc0e720c,
+ 0xed731875,
+ 0x5488c39d,
+ },
+
+ {
+ 0x0,
+ 0x44629f4f,
+ 0x88c53e9e,
+ 0xcca7a1d1,
+ 0xcafb7b7d,
+ 0x8e99e432,
+ 0x423e45e3,
+ 0x65cdaac,
+ 0x4e87f0bb,
+ 0xae56ff4,
+ 0xc642ce25,
+ 0x8220516a,
+ 0x847c8bc6,
+ 0xc01e1489,
+ 0xcb9b558,
+ 0x48db2a17,
+ },
+
+ {
+ 0x0,
+ 0x9d0fe176,
+ 0xe16ec4ad,
+ 0x7c6125db,
+ 0x19ac8f1b,
+ 0x84a36e6d,
+ 0xf8c24bb6,
+ 0x65cdaac0,
+ 0x33591e36,
+ 0xae56ff40,
+ 0xd237da9b,
+ 0x4f383bed,
+ 0x2af5912d,
+ 0xb7fa705b,
+ 0xcb9b5580,
+ 0x5694b4f6,
+ },
+
+ {
+ 0x0,
+ 0x66b23c6c,
+ 0xcd6478d8,
+ 0xabd644b4,
+ 0x41b9f7f1,
+ 0x270bcb9d,
+ 0x8cdd8f29,
+ 0xea6fb345,
+ 0x8373efe2,
+ 0xe5c1d38e,
+ 0x4e17973a,
+ 0x28a5ab56,
+ 0xc2ca1813,
+ 0xa478247f,
+ 0xfae60cb,
+ 0x691c5ca7,
+ },
+
+ {
+ 0x0,
+ 0xdd96d985,
+ 0x605cb54b,
+ 0xbdca6cce,
+ 0xc0b96a96,
+ 0x1d2fb313,
+ 0xa0e5dfdd,
+ 0x7d730658,
+ 0x5a03d36d,
+ 0x87950ae8,
+ 0x3a5f6626,
+ 0xe7c9bfa3,
+ 0x9abab9fb,
+ 0x472c607e,
+ 0xfae60cb0,
+ 0x2770d535,
+ },
+
+ {
+ 0x0,
+ 0xb407a6da,
+ 0xb37e4bf5,
+ 0x779ed2f,
+ 0xbd8d91ab,
+ 0x98a3771,
+ 0xef3da5e,
+ 0xbaf47c84,
+ 0xa06a2517,
+ 0x146d83cd,
+ 0x13146ee2,
+ 0xa713c838,
+ 0x1de7b4bc,
+ 0xa9e01266,
+ 0xae99ff49,
+ 0x1a9e5993,
+ },
+
+ {
+ 0x0,
+ 0x9ba54c6f,
+ 0xec3b9e9f,
+ 0x779ed2f0,
+ 0x3063b7f,
+ 0x98a37710,
+ 0xef3da5e0,
+ 0x7498e98f,
+ 0x60c76fe,
+ 0x9da93a91,
+ 0xea37e861,
+ 0x7192a40e,
+ 0x50a4d81,
+ 0x9eaf01ee,
+ 0xe931d31e,
+ 0x72949f71,
+ },
+
+ {
+ 0x0,
+ 0xc18edfc,
+ 0x1831dbf8,
+ 0x14293604,
+ 0x3063b7f0,
+ 0x3c7b5a0c,
+ 0x28526c08,
+ 0x244a81f4,
+ 0x60c76fe0,
+ 0x6cdf821c,
+ 0x78f6b418,
+ 0x74ee59e4,
+ 0x50a4d810,
+ 0x5cbc35ec,
+ 0x489503e8,
+ 0x448dee14,
+ },
+
+ {
+ 0x0,
+ 0xc18edfc0,
+ 0x586cb9c1,
+ 0x99e26601,
+ 0xb0d97382,
+ 0x7157ac42,
+ 0xe8b5ca43,
+ 0x293b1583,
+ 0xbac3e145,
+ 0x7b4d3e85,
+ 0xe2af5884,
+ 0x23218744,
+ 0xa1a92c7,
+ 0xcb944d07,
+ 0x52762b06,
+ 0x93f8f4c6,
+ },
+
+ {
+ 0x0,
+ 0xaef6c4cb,
+ 0x869c8fd7,
+ 0x286a4b1c,
+ 0xd64819ef,
+ 0x78bedd24,
+ 0x50d49638,
+ 0xfe2252f3,
+ 0x77e1359f,
+ 0xd917f154,
+ 0xf17dba48,
+ 0x5f8b7e83,
+ 0xa1a92c70,
+ 0xf5fe8bb,
+ 0x2735a3a7,
+ 0x89c3676c,
+ },
+
+ {
+ 0x0,
+ 0xefc26b3e,
+ 0x4f5d03d,
+ 0xeb37bb03,
+ 0x9eba07a,
+ 0xe629cb44,
+ 0xd1e7047,
+ 0xe2dc1b79,
+ 0x13d740f4,
+ 0xfc152bca,
+ 0x172290c9,
+ 0xf8e0fbf7,
+ 0x1a3ce08e,
+ 0xf5fe8bb0,
+ 0x1ec930b3,
+ 0xf10b5b8d,
+ },
+
+ {
+ 0x0,
+ 0x27ae81e8,
+ 0x4f5d03d0,
+ 0x68f38238,
+ 0x9eba07a0,
+ 0xb9148648,
+ 0xd1e70470,
+ 0xf6498598,
+ 0xe6050901,
+ 0xc1ab88e9,
+ 0xa9580ad1,
+ 0x8ef68b39,
+ 0x78bf0ea1,
+ 0x5f118f49,
+ 0x37e20d71,
+ 0x104c8c99,
+ },
+
+ {
+ 0x0,
+ 0x177b1443,
+ 0x2ef62886,
+ 0x398d3cc5,
+ 0x5dec510c,
+ 0x4a97454f,
+ 0x731a798a,
+ 0x64616dc9,
+ 0xbbd8a218,
+ 0xaca3b65b,
+ 0x952e8a9e,
+ 0x82559edd,
+ 0xe634f314,
+ 0xf14fe757,
+ 0xc8c2db92,
+ 0xdfb9cfd1,
+ },
+
+ {
+ 0x0,
+ 0xacc04271,
+ 0x82f182a3,
+ 0x2e31c0d2,
+ 0xde920307,
+ 0x72524176,
+ 0x5c6381a4,
+ 0xf0a3c3d5,
+ 0x6655004f,
+ 0xca95423e,
+ 0xe4a482ec,
+ 0x4864c09d,
+ 0xb8c70348,
+ 0x14074139,
+ 0x3a3681eb,
+ 0x96f6c39a,
+ },
+
+ {
+ 0x0,
+ 0xccaa009e,
+ 0x4225077d,
+ 0x8e8f07e3,
+ 0x844a0efa,
+ 0x48e00e64,
+ 0xc66f0987,
+ 0xac50919,
+ 0xd3e51bb5,
+ 0x1f4f1b2b,
+ 0x91c01cc8,
+ 0x5d6a1c56,
+ 0x57af154f,
+ 0x9b0515d1,
+ 0x158a1232,
+ 0xd92012ac,
+ },
+
+ {
+ 0x0,
+ 0x7cbb312b,
+ 0xf9766256,
+ 0x85cd537d,
+ 0x299dc2ed,
+ 0x5526f3c6,
+ 0xd0eba0bb,
+ 0xac509190,
+ 0x533b85da,
+ 0x2f80b4f1,
+ 0xaa4de78c,
+ 0xd6f6d6a7,
+ 0x7aa64737,
+ 0x61d761c,
+ 0x83d02561,
+ 0xff6b144a,
+ },
+
+ {
+ 0x0,
+ 0xa6770bb4,
+ 0x979f1129,
+ 0x31e81a9d,
+ 0xf44f2413,
+ 0x52382fa7,
+ 0x63d0353a,
+ 0xc5a73e8e,
+ 0x33ef4e67,
+ 0x959845d3,
+ 0xa4705f4e,
+ 0x20754fa,
+ 0xc7a06a74,
+ 0x61d761c0,
+ 0x503f7b5d,
+ 0xf64870e9,
+ },
+
+ {
+ 0x0,
+ 0x67de9cce,
+ 0xcfbd399c,
+ 0xa863a552,
+ 0x440b7579,
+ 0x23d5e9b7,
+ 0x8bb64ce5,
+ 0xec68d02b,
+ 0x8816eaf2,
+ 0xefc8763c,
+ 0x47abd36e,
+ 0x20754fa0,
+ 0xcc1d9f8b,
+ 0xabc30345,
+ 0x3a0a617,
+ 0x647e3ad9,
+ },
+
+ {
+ 0x0,
+ 0xcb5cd3a5,
+ 0x4dc8a10b,
+ 0x869472ae,
+ 0x9b914216,
+ 0x50cd91b3,
+ 0xd659e31d,
+ 0x1d0530b8,
+ 0xec53826d,
+ 0x270f51c8,
+ 0xa19b2366,
+ 0x6ac7f0c3,
+ 0x77c2c07b,
+ 0xbc9e13de,
+ 0x3a0a6170,
+ 0xf156b2d5,
+ },
+
+ {
+ 0x0,
+ 0x3d6029b,
+ 0x7ac0536,
+ 0x47a07ad,
+ 0xf580a6c,
+ 0xc8e08f7,
+ 0x8f40f5a,
+ 0xb220dc1,
+ 0x1eb014d8,
+ 0x1d661643,
+ 0x191c11ee,
+ 0x1aca1375,
+ 0x11e81eb4,
+ 0x123e1c2f,
+ 0x16441b82,
+ 0x15921919,
+ },
+
+ {
+ 0x0,
+ 0x3d6029b0,
+ 0x7ac05360,
+ 0x47a07ad0,
+ 0xf580a6c0,
+ 0xc8e08f70,
+ 0x8f40f5a0,
+ 0xb220dc10,
+ 0x30704bc1,
+ 0xd106271,
+ 0x4ab018a1,
+ 0x77d03111,
+ 0xc5f0ed01,
+ 0xf890c4b1,
+ 0xbf30be61,
+ 0x825097d1,
+ },
+
+ {
+ 0x0,
+ 0x60e09782,
+ 0xc1c12f04,
+ 0xa121b886,
+ 0x58f35849,
+ 0x3813cfcb,
+ 0x9932774d,
+ 0xf9d2e0cf,
+ 0xb1e6b092,
+ 0xd1062710,
+ 0x70279f96,
+ 0x10c70814,
+ 0xe915e8db,
+ 0x89f57f59,
+ 0x28d4c7df,
+ 0x4834505d,
+ },
+
+ {
+ 0x0,
+ 0xb8bc6765,
+ 0xaa09c88b,
+ 0x12b5afee,
+ 0x8f629757,
+ 0x37def032,
+ 0x256b5fdc,
+ 0x9dd738b9,
+ 0xc5b428ef,
+ 0x7d084f8a,
+ 0x6fbde064,
+ 0xd7018701,
+ 0x4ad6bfb8,
+ 0xf26ad8dd,
+ 0xe0df7733,
+ 0x58631056,
+ },
+
+ {
+ 0x0,
+ 0x5019579f,
+ 0xa032af3e,
+ 0xf02bf8a1,
+ 0x9b14583d,
+ 0xcb0d0fa2,
+ 0x3b26f703,
+ 0x6b3fa09c,
+ 0xed59b63b,
+ 0xbd40e1a4,
+ 0x4d6b1905,
+ 0x1d724e9a,
+ 0x764dee06,
+ 0x2654b999,
+ 0xd67f4138,
+ 0x866616a7,
+ },
+
+ {
+ 0x0,
+ 0x1c26a37,
+ 0x384d46e,
+ 0x246be59,
+ 0x709a8dc,
+ 0x6cbc2eb,
+ 0x48d7cb2,
+ 0x54f1685,
+ 0xe1351b8,
+ 0xfd13b8f,
+ 0xd9785d6,
+ 0xc55efe1,
+ 0x91af964,
+ 0x8d89353,
+ 0xa9e2d0a,
+ 0xb5c473d,
+ },
+
+ {
+ 0x0,
+ 0x1c26a370,
+ 0x384d46e0,
+ 0x246be590,
+ 0x709a8dc0,
+ 0x6cbc2eb0,
+ 0x48d7cb20,
+ 0x54f16850,
+ 0xe1351b80,
+ 0xfd13b8f0,
+ 0xd9785d60,
+ 0xc55efe10,
+ 0x91af9640,
+ 0x8d893530,
+ 0xa9e2d0a0,
+ 0xb5c473d0,
+ },
+
+ {
+ 0x0,
+ 0x191b3141,
+ 0x32366282,
+ 0x2b2d53c3,
+ 0x646cc504,
+ 0x7d77f445,
+ 0x565aa786,
+ 0x4f4196c7,
+ 0xc8d98a08,
+ 0xd1c2bb49,
+ 0xfaefe88a,
+ 0xe3f4d9cb,
+ 0xacb54f0c,
+ 0xb5ae7e4d,
+ 0x9e832d8e,
+ 0x87981ccf,
+ },
+
+ {
+ 0x0,
+ 0x4ac21251,
+ 0x958424a2,
+ 0xdf4636f3,
+ 0xf0794f05,
+ 0xbabb5d54,
+ 0x65fd6ba7,
+ 0x2f3f79f6,
+ 0x3b83984b,
+ 0x71418a1a,
+ 0xae07bce9,
+ 0xe4c5aeb8,
+ 0xcbfad74e,
+ 0x8138c51f,
+ 0x5e7ef3ec,
+ 0x14bce1bd,
+ },
+
+ {
+ 0x0,
+ 0x77073096,
+ 0xee0e612c,
+ 0x990951ba,
+ 0x76dc419,
+ 0x706af48f,
+ 0xe963a535,
+ 0x9e6495a3,
+ 0xedb8832,
+ 0x79dcb8a4,
+ 0xe0d5e91e,
+ 0x97d2d988,
+ 0x9b64c2b,
+ 0x7eb17cbd,
+ 0xe7b82d07,
+ 0x90bf1d91,
+ },
+
+ {
+ 0x0,
+ 0x1db71064,
+ 0x3b6e20c8,
+ 0x26d930ac,
+ 0x76dc4190,
+ 0x6b6b51f4,
+ 0x4db26158,
+ 0x5005713c,
+ 0xedb88320,
+ 0xf00f9344,
+ 0xd6d6a3e8,
+ 0xcb61b38c,
+ 0x9b64c2b0,
+ 0x86d3d2d4,
+ 0xa00ae278,
+ 0xbdbdf21c,
+ },
+ };
+
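+ // table64[nib][x] is the CRC-32 contribution of the 4-bit value x at nibble
+ // position nib of a 32-byte section. Because CRC-32 is linear over GF(2),
+ // the update for a whole section is the XOR of the 64 independent lookups,
+ // so all of them can be computed in parallel. The running CRC is folded
+ // into the first eight nibbles (32 bits) only.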
+ const int num_nibbles_parallel = 64;
+
+ const int num_sections = accessor_isz / (num_nibbles_parallel /
+ 2); // how many loop iterations
+ unsigned int result = ~0;
+
+ for (int i = 0; i < num_sections; i++) {
+ unsigned int result_update_odd = 0;
+ unsigned int result_update_even = 0;
+// Loop over the 4-bit chunks (nibbles) within this section. This loop can be
+// unrolled; the total CRC update is the XOR of the per-nibble updates.
+ #pragma unroll
+ for (int nib = 0; nib < num_nibbles_parallel; nib++) {
+ unsigned char this_input_nibble =
+ (acc_pibuf[(i * num_nibbles_parallel + nib) / 2] >>
+ (4 * (nib % 2)));
+ unsigned char this_result_nibble =
+ (nib < 8) ? (result >> (4 * nib)) : 0;
+ unsigned char this_table_index =
+ this_input_nibble ^ this_result_nibble;
+ if (nib % 2) {
+ result_update_odd ^= table64[nib][this_table_index & 0xf];
+ } else {
+ result_update_even ^= table64[nib][this_table_index & 0xf];
+ }
+ }
+ result = result_update_odd ^ result_update_even;
+ }
+
+ accresult_crc[0] = ~result;
+ });
+ });
+
+ e_lz = q.submit([&](handler &h) {
+ auto accessor_isz = block_size;
+ auto acc_pibuf = pibuf->get_access(h);
+
+ h.single_task>([=]() [[intel::kernel_args_restrict]] {
+ //-------------------------------------
+ // Hash Table(s)
+ //-------------------------------------
+
+ [[intelfpga::singlepump]] [[intelfpga::numbanks(kVec)]] [
+ [intelfpga::max_replicates(kVec)]] struct {
+ unsigned char s[kLen];
+ } dictionary[kDepth][kVec];
+
+ [[intelfpga::singlepump]] [[intelfpga::numbanks(kVec)]] [
+ [intelfpga::max_replicates(
+ kVec)]] unsigned int dict_offset[kDepth][kVec];
+
+ // Initialize history to empty.
+ for (int i = 0; i < kDepth; i++) {
+ Unroller<0, kVec>::step([&](int k) { dict_offset[i][k] = 0; });
+ }
+
+ // This is the window of data on which we look for matches
+ // We fetch twice our data size because we have kVec offsets
+ unsigned char current_window[kVecX2];
+
+ // These are the compare windows fetched from the dictionaries: one
+ // kLen-byte frame per dictionary, for each current-window offset
+ unsigned char compare_window[kLen][kVec][kVec];
+ // kVec bytes per dict----------| | |
+ // kVec dictionaries-----------------| |
+ // one for each curr win offset---------|
+
+ // load offset into these arrays
+ unsigned int compare_offset[kVec][kVec];
+ // one per kVec bytes----------| |
+ // one for each compwin-------------|
+
+ // Initialize input stream position
+ unsigned int inpos_minus_vec_div_16 = 0;
+
+ // this is ceiling of (insize-kVec)/16, original comparison was
+ // inpos < insize, now inpos is carried as (inpos-kVec)/16 so this is what
+ // we compare to
+ unsigned int insize_compare = (accessor_isz) / kVec;
+
+ int ctr = insize_compare = insize_compare - 1;
+
+ char first_valid_pos = 0;
+
+ struct DistLen dist_offs_data;
+
+ int distchan_ndx = 0;
+ size_t inpos = 0;
+
+ // load in new data
+ struct LzInput in;
+ Unroller<0, kVec>::step([&](int i) { in.data[i] = acc_pibuf[inpos++]; });
+
+ Unroller<0, kVec>::step(
+ [&](int i) { current_window[i + kVec] = in.data[i]; });
+
+ do {
+ //-----------------------------
+ // Prepare current window
+ //-----------------------------
+
+ // shift current window
+ Unroller<0, kVec>::step(
+ [&](int i) { current_window[i] = current_window[i + kVec]; });
+
+ // load in new data
+ Unroller<0, kVec>::step(
+ [&](int i) { in.data[i] = acc_pibuf[inpos++]; });
+
+ Unroller<0, kVec>::step(
+ [&](int i) { current_window[kVec + i] = in.data[i]; });
+
+ //-----------------------------
+ // Compute hash
+ //-----------------------------
+
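+ // Hash the 4 bytes starting at each of the kVec candidate positions in the
+ // current window; kHashMask keeps the index within the kDepth-entry
+ // dictionaries.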
+ unsigned short hash[kVec];
+
+ Unroller<0, kVec>::step([&](int i) {
+ hash[i] = (current_window[i] ^ (current_window[i + 1] << 6) ^
+ (current_window[i + 2] << 2) ^ current_window[i + 3]) &
+ kHashMask;
+ });
+
+ //-----------------------------
+ // Dictionary look-up
+ //-----------------------------
+
+ // loop over kVec compare windows, each has a different hash
+ Unroller<0, kVec>::step([&](int i) {
+ // loop over all kVec bytes
+ Unroller<0, kLen>::step([&](int j) {
+ Unroller<0, kVec>::step([&](int k) {
+ compare_window[k][j][i] = dictionary[hash[i]][j].s[k];
+ });
+ });
+ });
+
+ // loop over compare windows
+ Unroller<0, kVec>::step([&](int i) {
+ Unroller<0, kLen>::step([&](int j) {
+ // loop over frames in this compare window
+ // (they come from different dictionaries)
+ compare_offset[j][i] = dict_offset[hash[i]][j];
+ });
+ });
+
+ //-----------------------------
+ // Dictionary update
+ //-----------------------------
+
+ // loop over different dictionaries to store different frames
+ // store one frame per dictionary
+ // loop over kVec bytes to store
+ Unroller<0, kLen>::step([&](int i) {
+ Unroller<0, kVec>::step([&](int j) {
+ // store actual bytes
+ dictionary[hash[i]][i].s[j] = current_window[i + j];
+ });
+ });
+
+ Unroller<0, kVec>::step([&](int i) {
+ // loop over kVec different dictionaries and write one word to each
+ dict_offset[hash[i]][i] =
+ (inpos_minus_vec_div_16 << 4) |
+ i; // inpos - kVec + 0, we know that inpos - kVec has 0 as the 4
+ // lower bits so really just concatenate
+ });
+
+ //-----------------------------
+ // Match search
+ //-----------------------------
+
+ // arrays to store length, best length etc..
+ unsigned char length[kVec];
+ bool done[kVec];
+ char best_length[kVec];
+ unsigned int best_offset[kVec];
+
+ // initialize best_length
+ Unroller<0, kVec>::step([&](int i) {
+ best_length[i] = 0;
+ best_offset[i] = 0;
+ });
+
+ // loop over each comparison window frame
+ // one comes from each dictionary
+ Unroller<0, kVec>::step([&](int i) {
+ // initialize length and done
+ Unroller<0, kVec>::step([&](int l) {
+ length[l] = 0;
+ done[l] = 0;
+ });
+
+ // loop over each current window
+ Unroller<0, kVec>::step([&](int j) {
+ // loop over each char in the current window
+ // and corresponding char in comparison window
+ Unroller<0, kLen>::step([&](int k) {
+ bool comp =
+ current_window[k + j] == compare_window[k][i][j] && !done[j];
+ length[j] += comp;
+ done[j] = !comp;
+ });
+ });
+
+ // Check if this the best length
+ Unroller<0, kVec>::step([&](int m) {
+ bool update_best =
+ (length[m] > best_length[m]) && (compare_offset[i][m] != 0) &&
+ (((inpos_minus_vec_div_16 << kVecPow) | (i & (kVec - 1))) -
+ (compare_offset[i][m]) <
+ kMaxDistance);
+
+ unsigned int new_offset =
+ (((inpos_minus_vec_div_16 << kVecPow) | (m & (kVec - 1))) &
+ 0x7ffff) -
+ ((compare_offset[i][m] & 0x7ffff));
+
+ // Reconsider if new_offset is bigger than current offset, might
+ // take more bytes to encode
+ update_best = update_best && (length[m] == best_length[m]) &&
+ (new_offset > best_offset[m])
+ ? false
+ : update_best;
+
+ best_offset[m] = (update_best ? new_offset : best_offset[m]) &
+ 0x7ffff; // 19 bits is sufficient
+
+ best_length[m] = (update_best ? length[m] : best_length[m]) &
+ 0x1f; // 5 bits is sufficient
+ });
+ });
+
+ //-----------------------------
+ // Filter matches step 1
+ //-----------------------------
+
+ // Remove matches with offsets <= 0 (self-matches or no match at all), and
+ // keep only the matches that, when encoded, take fewer bytes than the
+ // literal bytes they replace
+ Unroller<0, kVec>::step([&](int i) {
+ best_length[i] = (((best_length[i] & 0x1f) >= 3) &&
+ ((best_offset[i]) < kMaxDistance)
+ ? best_length[i]
+ : 0) &
+ 0x1f; // 5 bits is sufficient
+
+ // Second level filter - remove matches with len 3, greater than
+ // kTooFar
+ best_length[i] =
+ (((best_length[i] & 0x1f) == 3) && ((best_offset[i]) > kTooFar)
+ ? 0
+ : best_length[i]) &
+ 0x1f; // 5 bits is sufficient
+ // don't emit matches for the last iteration as some of the
+ // second part of the window might be undefined
+ if (ctr == 0) best_length[i] = 0;
+ });
+
+ //-----------------------------
+ // Assign first_valid_pos
+ //-----------------------------
+
+ // first_valid_pos is loop-carried, and tricky to compute. So first
+ // compute it speculatively in parallel for every possible value of the
+ // previous first_valid_pos.
+ char first_valid_pos_speculative[kVec];
+
+ Unroller<0, kVec>::step([&](int guess) {
+ unsigned char next_match_search = guess;
+ Unroller<0, kVec>::step([&](int i) {
+ unsigned int len = best_length[i];
+
+ // Skip to the next match
+ next_match_search =
+ i >= next_match_search && len > 0 ? i + len : next_match_search;
+ });
+
+ first_valid_pos_speculative[guess] =
+ next_match_search - kVec > 0 ? next_match_search - kVec : 0;
+ });
+
+ // For kVec=16 (the largest currently supported), this should be a 16:1
+ // mux, which is 2 6LUTs deep. For larger kVec, it will be worse.
+ unsigned char current_valid_pos = first_valid_pos;
+ first_valid_pos =
+ first_valid_pos_speculative[first_valid_pos & (kVec - 1)] &
+ (kVec -
+ 1); // first_valid_pos only needs 4 bits, make this explicit
+
+ // greedy match selection
+ Unroller<0, (kVec)>::step([&](int i) {
+ unsigned int len = best_length[i];
+ best_length[i] = i < current_valid_pos ? -1 : best_length[i];
+ // Skip to the next match
+ current_valid_pos =
+ i >= current_valid_pos && len > 0 ? i + len : current_valid_pos;
+ });
+
+ //-----------------------------
+ // Setup LZ dist/len pairs to push to Huffman encode kernel
+ //-----------------------------
+
+ Unroller<0, kVec>::step([&](int i) {
+ dist_offs_data.data[i] = 0;
+ dist_offs_data.len[i] = -1;
+ dist_offs_data.dist[i] = -1;
+ if (best_length[i] >= 0) {
+ dist_offs_data.data[i] = current_window[i];
+ dist_offs_data.len[i] = best_length[i];
+ dist_offs_data.dist[i] = best_offset[i];
+ }
+ });
+
+ acc_dist_channel::write(dist_offs_data);
+
+ // increment input position
+ inpos_minus_vec_div_16++;
+ distchan_ndx += 1;
+ ctr--;
+
+ } while (ctr >= 0);
+
+ const char lasti = accessor_isz - (accessor_isz & ~(kVec - 1));
+ const char firstpos = first_valid_pos;
+ Unroller<0, kVec>::step([&](unsigned char i) {
+ dist_offs_data.data[i] = 0;
+ dist_offs_data.len[i] = -1;
+ dist_offs_data.dist[i] = -1;
+ });
+
+ Unroller<0, kVec>::step([&](unsigned char i) {
+ bool pred =
+ ((i - firstpos) < (lasti - firstpos)) && ((i - firstpos) >= 0);
+ dist_offs_data.data[i] = pred ? current_window[i + kVec] : 0;
+ dist_offs_data.len[i] = pred ? 0 : -1;
+ });
+
+ acc_dist_channel_last::write(dist_offs_data);
+ });
+ });
+
+ e_huff = q.submit([&](handler &h) {
+ auto accessor_isz = block_size;
+ auto acc_gzip_out =
+ gzip_out_buf->get_access(h);
+ auto accessor_output = pobuf->get_access(h);
+ auto acc_eof = last_block ? 1 : 0;
+ h.single_task>([=
+ ]() [[intel::kernel_args_restrict]] {
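+ // 'leftover' is a bit buffer carried across loop iterations: encoded bits
+ // that have not yet been flushed as a complete kVec-word block of output.
+ // 'leftover_size' tracks how many bits it currently holds.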
+ unsigned int leftover[kVec] = {0};
+ Unroller<0, kVec>::step([&](int i) { leftover[i] = 0; });
+
+ unsigned short leftover_size = 0;
+
+ unsigned int outpos_huffman = 0;
+
+ int ctr = ((accessor_isz) / kVec) + 2;
+ int odx = 0;
+
+ // Add the gzip start block marker. Assumes static huffman trees.
+ leftover_size = 3;
+ leftover[0] = ((kStaticTrees << 1) + (acc_eof));
+
+ do {
+ struct DistLen in;
+ // init the input structure for the gzip end block marker.
+ // this is the very last data block to be encoded and written.
+ Unroller<0, kVec>::step([&](int i) {
+ in.len[i] = -1;
+ in.dist[i] = -1;
+ in.data[i] = 0;
+ });
+ in.len[0] = ctr == 1 ? -3 : -1;
+ in.data[0] = 0;
+
+ in = ctr > 2 ? acc_dist_channel::read()
+ : (ctr == 2 ? acc_dist_channel_last::read() : in);
+
+ struct HuffmanOutput outdata;
+ outdata.write = HufEnc(in.len, in.dist, in.data, outdata.data, leftover,
+ &leftover_size);
+
+ // prevent out of bounds write
+ if (((ctr == 0) || outdata.write) && (odx < accessor_isz)) {
+ Unroller<0, kVec * sizeof(unsigned int)>::step([&](int i) {
+ accessor_output[odx + i] =
+ (ctr == 0) ? (unsigned char)(leftover[(i >> 2) & 0xf] >>
+ ((i & 3) << 3))
+ : (unsigned char)(outdata.data[(i >> 2) & 0xf] >>
+ ((i & 3) << 3));
+ });
+ }
+
+ outpos_huffman = outdata.write ? outpos_huffman + 1 : outpos_huffman;
+ odx += outdata.write ? (sizeof(unsigned int) << kVecPow) : 0;
+
+ } while (ctr--);
+
+ // Store summary values from lz and huffman
+ acc_gzip_out[0].compression_sz =
+ (outpos_huffman * sizeof(unsigned int) * kVec) +
+ (leftover_size + 7) / 8;
+ });
+ });
+}
+
+void SubmitGzipTasks(queue &q,
+ size_t block_size, // size of block to compress.
+ buffer *pibuf, buffer *pobuf,
+ buffer *gzip_out_buf,
+ buffer *result_crc, bool last_block,
+ event &e_crc, event &e_lz, event &e_huff,
+ size_t engineID) {
+ // Statically declare the engines so that the hardware is created for them.
+ // But at run time, the host can dynamically select which engine(s) to use via
+ // engineID.
+ if (engineID == 0) {
+ SubmitGzipTasksSingleEngine<0>(q, block_size, pibuf, pobuf, gzip_out_buf,
+ result_crc, last_block, e_crc, e_lz, e_huff);
+ }
+
+ #if NUM_ENGINES > 1
+ if (engineID == 1) {
+ SubmitGzipTasksSingleEngine<1>(q, block_size, pibuf, pobuf, gzip_out_buf,
+ result_crc, last_block, e_crc, e_lz, e_huff);
+ }
+ #endif
+
+ // If this reference design is to be expanded to > 2 engines, declare them here.
+
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp
new file mode 100755
index 0000000000..7de9a3ea17
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp
@@ -0,0 +1,45 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __GZIPKERNEL_H__
+#define __GZIPKERNEL_H__
+#pragma once
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+extern "C" void SubmitGzipTasks(
+ queue &sycl_device,
+ size_t block_size, // size of block to compress.
+ buffer *pibuf, buffer *pobuf,
+ buffer *gzip_out_buf,
+ buffer *current_crc, bool last_block, event &e_crc,
+ event &e_lz, event &e_huff, size_t engineID);
+
+#endif //__GZIPKERNEL_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp
new file mode 100755
index 0000000000..65f207bab7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp
@@ -0,0 +1,148 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __KERNELS_H__
+#define __KERNELS_H__
+#pragma once
+
+#ifndef NUM_ENGINES
+ #define NUM_ENGINES 1
+#endif
+
+constexpr int kNumEngines = NUM_ENGINES;
+
+// kVecPow == 2 means kVec == 4.
+// kVecPow == 3 means kVec == 8.
+// kVecPow == 4 means kVec == 16.
+constexpr int kVecPow = 4;
+
+constexpr int kVec = 1 << kVecPow;
+constexpr int kVecX2 = 2 * kVec;
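+// With the default kVecPow of 4, each engine consumes kVec == 16 input bytes
+// per pipeline step and the sliding current window spans kVecX2 == 32 bytes.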
+
+constexpr int kHufTableSize = 256;
+
+// Maximum length of huffman codes
+constexpr int kMaxHuffcodeBits = 16;
+
+struct Uint2Gzip {
+ unsigned int y;
+ unsigned int x;
+};
+
+struct LzInput {
+ unsigned char data[kVec];
+};
+
+typedef struct DistLen {
+ unsigned char data[kVec];
+ char len[kVec];
+ short dist[kVec];
+} DistLen, *pdist_len_t;
+
+struct HuffmanOutput {
+ unsigned int data[kVec];
+ bool write;
+};
+
+struct TrailingOutput {
+ int bytecount_left;
+ int bytecount;
+ unsigned char bytes[kVec * sizeof(unsigned int)];
+};
+
+struct GzipOutInfo {
+ // final compressed block size
+ size_t compression_sz;
+ unsigned long crc;
+};
+
+// kLen must be == kVec
+constexpr int kLen = kVec;
+
+// depth of the dictionary buffers
+constexpr int kDepth = 512;
+
+// Assumes kDepth is a power of 2.
+constexpr int kHashMask = kDepth - 1;
+
+#define CONSTANT __constant
+
+constexpr int kDebug = 1;
+#define TRACE(x) \
+ do { \
+ if (kDebug) printf x; \
+ } while (0)
+
+constexpr int kStaticTrees = 1;
+
+typedef struct CtData {
+ unsigned short code;
+ unsigned short len;
+} CtData;
+
+constexpr int kMaxMatch = 258;
+constexpr int kMinMatch = 3;
+
+constexpr int kTooFar = 4096;
+
+// All codes must not exceed kMaxBits
+constexpr int kMaxBits = 15;
+
+// number of length codes, not counting the special kEndBlock code
+constexpr int kLengthCodes = 29;
+
+// number of literal bytes, 0..255
+constexpr int kLiterals = 256;
+
+// end of literal code block
+constexpr int kEndBlock = 256;
+
+// number of literal or length codes, including kEndBlock
+constexpr int kLCodes = (kLiterals + 1 + kLengthCodes);
+
+// number of distance codes
+constexpr int kDCodes = 30;
+
+// number of codes used to transfer the bit lengths
+constexpr int kBLCodes = 19;
+
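+// 32 KB is the maximum back-reference distance allowed by the DEFLATE format.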
+constexpr int kMaxDistance = ((32 * 1024));
+
+constexpr int kMinBufferSize = 16384;
+
+struct DictString {
+ unsigned char s[kLen];
+};
+
+// Mapping from a distance to a distance code. dist is the distance - 1 and
+// must not have side effects. dist_code[256] and dist_code[257] are never
+// used.
+#define d_code(dist) \
+ ((dist) < 256 ? dist_code[dist] : dist_code[256 + ((dist) >> 7)])
+
+#endif //__KERNELS_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt
new file mode 100755
index 0000000000..81cd1c747a
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+
+cmake_minimum_required (VERSION 2.8)
+
+project(QRD)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md
new file mode 100755
index 0000000000..34288260cf
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md
@@ -0,0 +1,239 @@
+# QR Decomposition of Matrices
+This DPC++ reference design demonstrates high-performance QR decomposition of complex matrices on FPGA.
+
+***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® PAC with Intel Stratix® 10 SX FPGA;
Intel Xeon® CPU E5-1650 v2 @ 3.50GHz (host machine)
+| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | Implementing a high performance FPGA version of the Gram-Schmidt QR decomposition algorithm.
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+
+**Performance**
+Please refer to performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 25k matrices/s for matrices of size 128 * 128
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 7k matrices/s for matrices of size 256 * 256
+
+
+## Purpose
+
+This FPGA reference design demonstrates QR decomposition of matrices of complex numbers, a common operation employed in linear algebra. Matrix _A_ (input) is decomposed into a product of an orthogonal matrix _Q_ and an upper triangular matrix _R_.
+
+The algorithms employed by the reference design are the Gram-Schmidt QR decomposition algorithm and the thin QR factorization method. Background information on these algorithms can be found in Wikipedia's [QR decomposition](https://en.wikipedia.org/wiki/QR_decomposition) article. The original algorithm has been modified and optimized for performance on FPGAs in this implementation.
+
+QR decomposition is used extensively in signal processing applications such as beamforming, multiple-input multiple-output (MIMO) processing, and Space Time Adaptive Processing (STAP).
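+
+For background, the following is a minimal host-side sketch of the modified Gram-Schmidt procedure for a real _m_ × _n_ matrix stored column by column. It is purely illustrative (the function and variable names below are not part of the design); the FPGA kernel in `qrd.cpp` uses a refactored, complex-valued formulation of the same idea.
+
+```
+#include <cmath>
+#include <vector>
+
+// Thin QR by modified Gram-Schmidt: A (m x n, column-major) = Q (m x n) * R (n x n).
+void GramSchmidtQR(const std::vector<float> &A, int m, int n,
+                   std::vector<float> &Q, std::vector<float> &R) {
+  Q = A;                  // start from the columns of A
+  R.assign(n * n, 0.0f);
+  for (int j = 0; j < n; j++) {
+    // Remove from column j its projection onto each already-orthonormal column.
+    for (int i = 0; i < j; i++) {
+      float dot = 0.0f;
+      for (int k = 0; k < m; k++) dot += Q[i * m + k] * Q[j * m + k];
+      R[j * n + i] = dot;  // R(i, j), stored column-major
+      for (int k = 0; k < m; k++) Q[j * m + k] -= dot * Q[i * m + k];
+    }
+    // Normalize column j; its norm becomes the diagonal entry R(j, j).
+    float norm = 0.0f;
+    for (int k = 0; k < m; k++) norm += Q[j * m + k] * Q[j * m + k];
+    norm = std::sqrt(norm);
+    R[j * n + j] = norm;
+    for (int k = 0; k < m; k++) Q[j * m + k] /= norm;
+  }
+}
+```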
+
+
+### Matrix dimensions and FPGA resources
+
+The QR decomposition algorithm factors a complex _m_×_n_ matrix, where _m_ ≥ _n_. The algorithm computes the vector dot product of two columns of the matrix. In our FPGA implementation, the dot product is computed in a loop over the _m_ elements of the column. The loop is fully unrolled to maximize throughput. As a result, *m* complex multiplication operations are performed in parallel on the FPGA, followed by sequential additions to compute the dot product result.
+
+We use the compiler flag `-Xsfp-relaxed`, which permits the compiler to reorder floating point additions (i.e. to assume that floating point addition is associative). The compiler uses this freedom to reorder the additions so that the dot product arithmetic can be optimally implemented using the FPGA's specialized floating point DSP (Digital Signal Processing) hardware.
+
+With this optimization, our FPGA implementation requires 4*m* DSPs to compute the complex floating point dot product. Thus, the matrix size is constrained by the total FPGA DSP resources available. Note that this upper bound is a consequence of this particular implementation.
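+
+As an illustration of the arithmetic, the scalar sketch below (hypothetical names, not kernel code) accumulates a complex dot product element by element. Each element contributes four real multiplies, which is where the 4*m* DSP estimate comes from once the loop over _m_ is fully unrolled.
+
+```
+struct Complex {
+  float re, im;
+};
+
+// Dot product <a, b> = sum of a[k] * conj(b[k]) over m elements.
+// When the loop over k is fully unrolled, the 4 real multiplies per element
+// become 4*m parallel multipliers, followed by an adder tree for the sums.
+Complex ComplexDot(const Complex *a, const Complex *b, int m) {
+  Complex acc = {0.0f, 0.0f};
+  for (int k = 0; k < m; k++) {
+    acc.re += a[k].re * b[k].re + a[k].im * b[k].im;  // 2 real multiplies
+    acc.im += a[k].im * b[k].re - a[k].re * b[k].im;  // 2 real multiplies
+  }
+  return acc;
+}
+```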
+
+By default, the design is parameterized to process 128 × 128 matrices when compiled targeting Intel® PAC with Intel Arria® 10 GX FPGA. It is parameterized to process 256 × 256 matrices when compiled targeting Intel® PAC with Intel Stratix® 10 SX FPGA, a larger device.
+
+
+## Key Implementation Details
+| Kernel | Description
+--- |---
+| QRD | Implements a modified Gram-Schmidt QR decomposition algorithm.
+
+To optimize the performance-critical loop in its algorithm, the design leverages concepts discussed in the following FPGA tutorials:
+* **Triangular Loop Optimization** (triangular_loop)
+* **Explicit Pipelining with `fpga_reg`** (fpga_register)
+* **Loop `ivdep` Attribute** (loop_ivdep)
+* **Unrolling Loops** (loop_unroll)
+
+ The key optimization techniques used are as follows:
+ 1. Refactoring the algorithm to merge two dot products into one, reducing the total number of dot products needed from three to two. This helps us reduce the number of DSPs needed for the implementation.
+ 2. Converting the nested loop into a single merged loop and applying Triangular Loop optimizations (a simplified sketch of this transformation follows this list). This allows us to generate a design that is very well pipelined.
+ 3. Fully vectorizing the dot products using loop unrolling.
+ 4. Using the compiler flag `-Xsfp-relaxed` to re-order floating point operations, allowing the inference of a specialized dot-product DSP. This further reduces the number of DSP blocks needed by the implementation, the overall latency, and the pipeline depth.
+ 5. Using an efficient memory banking scheme to generate high performance hardware.
+ 6. Using the `fpga_reg` attribute to insert more pipeline stages where needed to improve the frequency achieved by the design.
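+
+A simplified sketch of the loop merging described in item 2 is shown below (hypothetical names; the actual loop in `qrd.cpp` additionally pads short rows with extra iterations to satisfy the `ivdep` safelen set by `FIXED_ITERATIONS`):
+
+```
+// Original nested triangular loop:
+//   for (int i = 0; i < n; i++)
+//     for (int j = i; j < n; j++)
+//       Work(i, j);
+
+void Work(int i, int j) { /* body of the original nested loop */ }
+
+// Merged form: a single loop visits the same (i, j) pairs, so only one loop
+// has to be pipelined.
+void MergedTriangular(int n) {
+  int i = 0, j = 0;
+  int total_iterations = n * (n + 1) / 2;  // number of (i, j) pairs with j >= i
+  for (int s = 0; s < total_iterations; s++) {
+    Work(i, j);
+    if (j == n - 1) {  // end of row i: advance to the next row
+      i++;
+      j = i;
+    } else {
+      j++;
+    }
+  }
+}
+```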
+
+## License
+This code sample is licensed under MIT license.
+
+## Building the Reference Design
+
+### Include Files
+The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Code Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h.
+
+### On a Linux* System
+1. Generate the `Makefile` by running `cmake` from a `build` directory inside the design directory:
+
+ ```
+ mkdir build
+ cd build
+ ```
+
+ If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+
+ ```
+ cmake ..
+ ```
+
+ If instead you are compiling for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following targets are provided and they match the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device).
+
+ ```
+ make fpga_emu
+ ```
+
+ * Generate HTML performance report. Find the report at `qrd_report.prj/reports/report.html`.
+
+ ```
+ make report
+ ```
+
+ * Compile for FPGA hardware (longer compile time, targets FPGA device).
+
+ ```
+ make fpga
+ ```
+
+3. (Optional) As the above hardware compile may take several hours to complete, a precompiled binary for the Intel® PAC with Intel Arria® 10 GX FPGA can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+Note: Ensure that Microsoft Visual Studio* (2017, or 2019 Version 16.4 or newer) with "Desktop development with C++" workload is installed on your system.
+
+1. Enter source file directory.
+
+```
+cd src
+```
+
+2. Compile the design. The following targets are provided and they match the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device).
+
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate HTML performance report. Find the report at `../src/qrd_report.prj/reports/report.html`.
+
+ ```
+ ninja report
+ ```
+
+ If you are targeting the Intel® PAC with Intel Stratix® 10 SX FPGA, please use the following target and find the report in `../src/qrd_s10_pac_report.prj/reports/report.html`.
+
+ ```
+ ninja report_s10_pac
+ ```
+
+ * **Not supported yet:** Compile and run on FPGA hardware.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this Reference Design in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Running the Reference Design
+You can apply QR decomposition to a number of matrices as shown below. This step performs the following:
+* Generates the number of random matrices specified as the command line argument (defaults to 1).
+* Computes QR decomposition on all matrices.
+* Evaluates performance.
+NOTE: The design is optimized to perform best when run on a large number of matrices, where the total number of matrices is a power of 2.
+
+
+
+1. Run the sample on the FPGA emulator (the kernel executes on the CPU).
+ ```
+ ./qrd.fpga_emu (Linux)
+ qrd.fpga_emu.exe (Windows)
+ ```
+
+2. Run the sample on the FPGA device. It is recommended to pass in an optional argument (as shown) when invoking the sample on hardware. Otherwise, the performance will not be representative.
+ ```
+ ./qrd.fpga 40960 (Linux)
+ ```
+### Application Parameters
+
+| Argument | Description
+--- |---
+| `<num>` | Optional argument that specifies the number of matrices to decompose. Its default value is `1`.
+
+### Example of Output
+
+Example output when running on Intel® PAC with Intel Arria® 10 GX FPGA for 32768 matrices (each consisting of 128 * 128 complex numbers):
+
+```
+Device name: pac_a10 : Intel PAC Platform (pac_f000000)
+Generating 32768 random matrices
+Running QR decomposition of 32768 matrices repeatedly
+ Total duration: 41.3763 s
+Throughput: 25.3425k matrices/s
+Verifying results on matrix 0 16384 32767
+PASSED
+```
+
+Example output when running on Intel® PAC with Intel Stratix® 10 SX FPGA for 4096 matrices (each consisting of 256 * 256 complex numbers):
+
+```
+Device name: pac_s10 : Intel PAC Platform (pac_f100000)
+Generating 4096 random matrices
+Running QR decomposition of 4096 matrices repeatedly
+ Total duration: 17.3197 s
+Throughput: 7.5678k matrices/s
+Verifying results on matrix 0 2048 4095
+PASSED
+```
+
+## Additional Design Information
+
+### Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsclock=330MHz` | The FPGA backend attempts to achieve 330 MHz
+`-Xsfp-relaxed` | Allows the FPGA backend to re-order floating point arithmetic operations (e.g. permit assuming (a + b + c) == (c + a + b) )
+`-Xsparallel=2` | Use 2 cores when compiling the bitstream through Quartus
+`-Xsseed` | Specifies the Quartus compile seed, to yield slightly higher fmax
+`-DROWS_COMPONENT` | Specifies the number of rows of the matrix
+`-DCOLS_COMPONENT` | Specifies the number of columns of the matrix
+`-DFIXED_ITERATIONS` | Used to set the ivdep safelen attribute for the performance critical triangular loop
+
+NOTE: The values for `seed`, `FIXED_ITERATIONS`, `ROWS_COMPONENT`, `COLS_COMPONENT` are set according to the board being targeted.
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 29, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com).
+
+The performance was measured by Intel on July 29, 2020.
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln
new file mode 100755
index 0000000000..b5e086d1f5
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "qrd", "qrd.vcxproj", "{ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.ActiveCfg = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.Build.0 = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.ActiveCfg = Release|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {97D1BD74-AAAB-4835-8F00-37A58B70871A}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj
new file mode 100755
index 0000000000..95a7067c03
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj
@@ -0,0 +1,170 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {acde6b7a-6f9a-428e-b040-cedc5b1e2c79}
+ Win32Proj
+ qrd
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ -Xsclock=330MHz;-Xsfp-relaxed;-Xsparallel=2
+
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+ -Xsclock=330MHz;-Xsfp-relaxed;-Xsparallel=2
+
+
+
+
+
+
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json
new file mode 100755
index 0000000000..aa107a266e
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json
@@ -0,0 +1,57 @@
+{
+ "guid": "3228581F-9DF8-4696-9B1C-0B31286B97C3",
+ "name": "QR Decomposition of Matrices on FPGA",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "Reference design demonstrating high-performance QR decomposition of complex matrices on FPGA",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "env": [
+ "export CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=32MB"
+ ],
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./qrd.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "env": [
+ "set CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=32MB"
+ ],
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "qrd.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt
new file mode 100755
index 0000000000..5003e6a357
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt
@@ -0,0 +1,129 @@
+set(DEVICE_SOURCE_FILE qrd.cpp)
+set(DEVICE_HEADER_FILE qrd.hpp)
+set(HOST_SOURCE_FILE qrd_demo.cpp)
+set(TARGET_NAME qrd)
+
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+set(ROWS_COMPONENT_A10 128)
+set(COLS_COMPONENT_A10 128)
+
+set(ROWS_COMPONENT_S10 256)
+set(COLS_COMPONENT_S10 256)
+
+set(FIXED_ITERATIONS_A10 64)
+set(FIXED_ITERATIONS_S10 105)
+
+set(SEED_A10 5)
+set(SEED_S10 1)
+
+# Set parameter values assuming target is Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(FIXED_ITERATIONS ${FIXED_ITERATIONS_A10})
+SET(SEED ${SEED_A10})
+SET(ROWS_COMPONENT ${ROWS_COMPONENT_A10})
+SET(COLS_COMPONENT ${COLS_COMPONENT_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(FIXED_ITERATIONS ${FIXED_ITERATIONS_S10})
+ SET(SEED ${SEED_S10})
+ SET(ROWS_COMPONENT ${ROWS_COMPONENT_S10})
+ SET(COLS_COMPONENT ${COLS_COMPONENT_S10})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS -fintelfpga -c -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsclock=330MHz -Xsfp-relaxed -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT})
+set(FINAL_LINK_FLAGS -fintelfpga -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT})
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT}")
+set(EMULATOR_LINK_FLAGS -fintelfpga )
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "qrd_fpga.o")
+ set(DEVICE_IMAGE_FPGA_OBJ "qrd_fpga.a")
+ set(HOST_FPGA_OBJ "qrd_host.o")
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+ add_custom_command(OUTPUT ${HOST_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${HOST_FPGA_OBJ}
+ DEPENDS ${HOST_SOURCE_FILE})
+
+ add_custom_command(OUTPUT ${DEVICE_IMAGE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link=image ${DEVICE_FPGA_OBJ} -o ${DEVICE_IMAGE_FPGA_OBJ}
+ DEPENDS ${DEVICE_FPGA_OBJ})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${FINAL_LINK_FLAGS} ${HOST_FPGA_OBJ} ${DEVICE_IMAGE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_IMAGE_FPGA_OBJ} ${HOST_FPGA_OBJ})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${DEVICE_SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_HEADER_FILE} ${DEVICE_HEADER_FILE} COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja
new file mode 100755
index 0000000000..619923b204
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja
@@ -0,0 +1,32 @@
+device_source_file = qrd.cpp
+device_header_file = qrd.hpp
+host_source_file = qrd_demo.cpp
+target_name = qrd
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsclock=330MHz -Xsfp-relaxed -Xsparallel=2
+emulator_flags = -fintelfpga -DFPGA_EMULATOR -Xsfast-emulator
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${device_source_file} ${host_source_file} ${design_flags} -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${device_source_file} ${host_source_file} -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 -Xsseed=5 -fsycl-link -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${device_source_file} ${host_source_file} -DFIXED_ITERATIONS=105 -DROWS_COMPONENT=256 -DCOLS_COMPONENT=256 -Xsseed=1 -fsycl-link -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp
new file mode 100755
index 0000000000..a6d973cbaa
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp
@@ -0,0 +1,318 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include
+#include
+#include
+#include
+#include
+
+#include "qrd.hpp"
+
+using std::vector;
+using namespace sycl;
+
+template <int begin, int end>
+struct Unroller {
+  template <typename Action>
+  static void Step(const Action &action) {
+    action(begin);
+    Unroller<begin + 1, end>::Step(action);
+  }
+};
+
+template <int end>
+struct Unroller<end, end> {
+  template <typename Action>
+  static void Step(const Action &action) {}
+};
+
+struct MyComplex {
+ float xx;
+ float yy;
+ MyComplex(float x, float y) {
+ xx = x;
+ yy = y;
+ }
+ MyComplex() {}
+ const MyComplex operator+(const MyComplex other) const {
+ return MyComplex(xx + other.xx, yy + other.yy);
+ }
+};
+
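+// Complex multiply with the second operand conjugated: returns a * conj(b),
+// the form needed when accumulating the complex dot product
+// sum(a[k] * conj(b[k])) in the kernel below.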
+MyComplex MulMycomplex(MyComplex a, MyComplex b) {
+ MyComplex c;
+ c.xx = a.xx * b.xx + a.yy * b.yy;
+ c.yy = a.yy * b.xx - a.xx * b.yy;
+ return c;
+}
+
+// Forward declare the kernel name
+// (This will become unnecessary in a future compiler version.)
+class QRD;
+
+void QRDecomposition(vector<float> &in_matrix, vector<float> &out_matrix, queue &q,
+ size_t matrices, size_t reps) {
+ // Number of complex elements in the matrix
+ constexpr int kNumComplexElements = COLS_COMPONENT * ROWS_COMPONENT;
+
+ // Sizes of allocated memories for input and output matrix
+ constexpr int kInputMatrixSize = kNumComplexElements * 2;
+ constexpr int kOutputMatrixSize =
+ (ROWS_COMPONENT + 1) * COLS_COMPONENT * 3;
+
+ // Constants related to the memory configuration of the kernel's local
+ // memories
+ // We want 4 complex elements (2 floating point values) in each memory bank
+ constexpr int kNumElementsPerBank = 4;
+ // Set the bankwidth in bytes
+ constexpr int kBankwidth = kNumElementsPerBank * 8;
+ constexpr int kNumBanks = ROWS_COMPONENT / kNumElementsPerBank;
+
+ constexpr int kLoadIter = kNumComplexElements / kNumElementsPerBank;
+ constexpr int kStoreIter = kNumComplexElements / kNumElementsPerBank;
+ constexpr short kNumBuffers = 4;
+
+ // We will process 'chunk' number of matrices in each run of the kernel
+ short chunk = 2048;
+ if (matrices % chunk) {
+ chunk = 1;
+ }
+
+ // Create buffers and allocate space for them.
+ buffer *input_matrix[kNumBuffers], *output_matrix[kNumBuffers];
+ for (short i = 0; i < kNumBuffers; i++) {
+ input_matrix[i] = new buffer(kInputMatrixSize * chunk);
+ output_matrix[i] = new buffer(kOutputMatrixSize * chunk);
+ }
+
+ for (size_t r = 0; r < reps; r++) {
+ for (size_t b = 0, it = 0; it < matrices;
+ it += chunk, b = (b + 1) % kNumBuffers) {
+ const float *kPtr = in_matrix.data() + kInputMatrixSize * it;
+ float *kPtr2 = out_matrix.data() + kOutputMatrixSize * it;
+ int matrices = chunk;
+
+ q.submit([&](handler &h) {
+ auto in_matrix2 =
+ input_matrix[b]->get_access(h);
+ h.copy(kPtr, in_matrix2);
+ });
+
+ q.submit([&](handler &h) {
+ auto in_matrix = input_matrix[b]->get_access(h);
+ auto out_matrix =
+ output_matrix[b]->get_access(h);
+ auto out_matrix2 = out_matrix;
+ h.single_task<QRD>([=]() [[intel::kernel_args_restrict]] {
+ for (int l = 0; l < matrices; l++) {
+ [[intelfpga::bankwidth(kBankwidth),
+ intelfpga::numbanks(kNumBanks)]] struct {
+ MyComplex d[ROWS_COMPONENT];
+ } a_matrix[COLS_COMPONENT], ap_matrix[COLS_COMPONENT],
+ aload_matrix[COLS_COMPONENT];
+
+ MyComplex vector_ai[ROWS_COMPONENT], vector_ti[ROWS_COMPONENT];
+ MyComplex s_or_i[COLS_COMPONENT];
+
+ // Copy data from DDR memory to on-chip memory.
+ int idx = l * kNumComplexElements / kNumElementsPerBank;
+ for (short li = 0; li < kLoadIter; li++) {
+ MyComplex tmp[kNumElementsPerBank];
+ Unroller<0, kNumElementsPerBank>::Step([&](int k) {
+ tmp[k].xx = in_matrix[idx * 2 * kNumElementsPerBank + k * 2];
+ tmp[k].yy =
+ in_matrix[idx * 2 * kNumElementsPerBank + k * 2 + 1];
+ });
+
+ idx++;
+ int jtmp = li % (kNumBanks);
+
+ Unroller<0, kNumBanks>::Step([&](int k) {
+ Unroller<0, kNumElementsPerBank>::Step([&](int t) {
+ if (jtmp == k) {
+ aload_matrix[li / (kNumBanks)]
+ .d[k * kNumElementsPerBank + t].xx = tmp[t].xx;
+ aload_matrix[li / (kNumBanks)]
+ .d[k * kNumElementsPerBank + t].yy = tmp[t].yy;
+ }
+
+ // Delay data signals to create a vine-based data distribution
+ // to lower signal fanout.
+ tmp[t].xx = intel::fpga_reg(tmp[t].xx);
+ tmp[t].yy = intel::fpga_reg(tmp[t].yy);
+ });
+
+ jtmp = intel::fpga_reg(jtmp);
+ });
+ }
+
+ float p_ii_x, i_r_ii_x;
+ short i = -1;
+ short j = N_VALUE - FIXED_ITERATIONS < 0
+ ? (N_VALUE - FIXED_ITERATIONS)
+ : 0;
+ int qr_idx = l * kOutputMatrixSize / 2;
+
+ [[intelfpga::ii(1)]] [[intelfpga::ivdep(FIXED_ITERATIONS)]]
+ for (int s = 0; s < ITERATIONS; s++) {
+ MyComplex vector_t[ROWS_COMPONENT];
+ MyComplex sori[kNumBanks];
+
+ bool j_eq_i[kNumBanks], i_gt_0[kNumBanks],
+ i_ge_0_j_eq_i[kNumBanks], j_eq_i_plus_1[kNumBanks],
+ i_lt_0[kNumBanks];
+
+ Unroller<0, kNumBanks>::Step([&](int k) {
+ i_gt_0[k] = intel::fpga_reg(i > 0);
+ i_lt_0[k] = intel::fpga_reg(i < 0);
+ j_eq_i[k] = intel::fpga_reg(j == i);
+ i_ge_0_j_eq_i[k] = intel::fpga_reg(i >= 0 && j >= i);
+ j_eq_i_plus_1[k] = intel::fpga_reg(j == i + 1);
+ sori[k].xx = intel::fpga_reg(s_or_i[j].xx);
+ sori[k].yy = intel::fpga_reg(s_or_i[j].yy);
+ });
+
+ Unroller<0, ROWS_COMPONENT>::Step([&](int k) {
+ vector_t[k].xx = aload_matrix[j].d[k].xx;
+ vector_t[k].yy = aload_matrix[j].d[k].yy;
+ if (i_gt_0[k / kNumElementsPerBank]) {
+ vector_t[k].xx = a_matrix[j].d[k].xx;
+ vector_t[k].yy = a_matrix[j].d[k].yy;
+ }
+ if (j_eq_i[k / kNumElementsPerBank]) {
+ vector_ai[k].xx = vector_t[k].xx;
+ vector_ai[k].yy = vector_t[k].yy;
+ }
+ });
+
+ Unroller<0, ROWS_COMPONENT>::Step([&](int k) {
+ vector_t[k] =
+ MulMycomplex(vector_ai[k],
+ i_lt_0[k / kNumElementsPerBank]
+ ? MyComplex(0.0, 0.0)
+ : sori[k / kNumElementsPerBank]) +
+ (j_eq_i[k / kNumElementsPerBank] ? MyComplex(0.0, 0.0)
+ : vector_t[k]);
+ if (i_ge_0_j_eq_i[k / kNumElementsPerBank]) {
+ ap_matrix[j].d[k].xx = a_matrix[j].d[k].xx =
+ vector_t[k].xx;
+ ap_matrix[j].d[k].yy = a_matrix[j].d[k].yy =
+ vector_t[k].yy;
+ }
+ if (j_eq_i_plus_1[k / kNumElementsPerBank]) {
+ vector_ti[k] = vector_t[k];
+ }
+ });
+
+ MyComplex p_ij = MyComplex(0, 0);
+ Unroller<0, ROWS_COMPONENT>::Step([&](int k) {
+ p_ij = p_ij + MulMycomplex(vector_t[k], vector_ti[k]);
+ });
+
+ if (j == i + 1) {
+ p_ii_x = p_ij.xx;
+ i_r_ii_x = rsqrt(p_ij.xx);
+ }
+
+ MyComplex s_ij =
+ MyComplex(0.0f - (p_ij.xx) / p_ii_x, p_ij.yy / p_ii_x);
+
+ if (j >= 0) {
+ s_or_i[j] = MyComplex(j == i + 1 ? i_r_ii_x : s_ij.xx,
+ j == i + 1 ? 0.0f : s_ij.yy);
+ }
+
+ MyComplex r_ii = j == i + 1 ? MyComplex(sycl::sqrt(p_ii_x), 0.0)
+ : MyComplex(i_r_ii_x * p_ij.xx,
+ i_r_ii_x * p_ij.yy);
+
+ if (j >= i + 1 && i + 1 < N_VALUE) {
+ out_matrix[qr_idx * 2] = r_ii.xx;
+ out_matrix[qr_idx * 2 + 1] = r_ii.yy;
+ qr_idx++;
+ }
+
+ if (j == N_VALUE - 1) {
+ j = ((N_VALUE - FIXED_ITERATIONS) > i)
+ ? (i + 1)
+ : (N_VALUE - FIXED_ITERATIONS);
+ i++;
+ } else {
+ j++;
+ }
+ }
+
+ qr_idx /= 4;
+ for (short si = 0; si < kStoreIter; si++) {
+ int desired = si % (kNumBanks);
+ bool get[kNumBanks];
+ Unroller<0, kNumBanks>::Step([&](int k) {
+ get[k] = desired == k;
+ desired = intel::fpga_reg(desired);
+ });
+
+ MyComplex tmp[kNumElementsPerBank];
+ Unroller<0, kNumBanks>::Step([&](int t) {
+ Unroller<0, kNumElementsPerBank>::Step([&](int k) {
+ tmp[k].xx = get[t] ? ap_matrix[si / (kNumBanks)]
+ .d[t * kNumElementsPerBank + k]
+ .xx
+ : intel::fpga_reg(tmp[k].xx);
+ tmp[k].yy = get[t] ? ap_matrix[si / (kNumBanks)]
+ .d[t * kNumElementsPerBank + k]
+ .yy
+ : intel::fpga_reg(tmp[k].yy);
+ });
+ });
+
+ Unroller<0, 4>::Step([&](int k) {
+ out_matrix2[qr_idx * 2 * kNumElementsPerBank + k * 2] =
+ tmp[k].xx;
+ out_matrix2[qr_idx * 2 * kNumElementsPerBank + k * 2 + 1] =
+ tmp[k].yy;
+ });
+
+ qr_idx++;
+ }
+ }
+ });
+ });
+
+ q.submit([&](handler &h) {
+ auto final_matrix = output_matrix[b]->get_access<access::mode::read>(h);
+ h.copy(final_matrix, kPtr2);
+ });
+ }
+ }
+
+ for (short b = 0; b < kNumBuffers; b++) {
+ delete input_matrix[b];
+ delete output_matrix[b];
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp
new file mode 100755
index 0000000000..4ada530ea7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp
@@ -0,0 +1,43 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+// The values for FIXED_ITERATIONS, ROWS_COMPONENT and COLS_COMPONENT will be
+// supplied by the build system (cmake/build.ninja)
+
+// Architecture/Design Parameters used to implement the triangular loop
+// structure of the design. See the tutorial on triangular loop optimization
+// for more details.
+#define N_VALUE COLS_COMPONENT
+
+#define M_MINUS_COLS \
+ (FIXED_ITERATIONS > COLS_COMPONENT ? FIXED_ITERATIONS - COLS_COMPONENT : 0)
+
+#define ITERATIONS \
+ (COLS_COMPONENT + M_MINUS_COLS + (COLS_COMPONENT + 1) * COLS_COMPONENT / 2 + \
+ FIXED_ITERATIONS * (FIXED_ITERATIONS - 1) / 2 - \
+ M_MINUS_COLS * (M_MINUS_COLS - 1) / 2)
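+
+// Illustrative example only (the real values come from the build system):
+// if COLS_COMPONENT were 128 and FIXED_ITERATIONS were 64, then
+// M_MINUS_COLS = 0 and
+// ITERATIONS = 128 + 0 + (129 * 128) / 2 + (64 * 63) / 2 - 0
+//            = 128 + 8256 + 2016 = 10400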
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp
new file mode 100755
index 0000000000..4bee78a672
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp
@@ -0,0 +1,233 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include <cmath>
+#include <cstdlib>
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <chrono>
+#include <iostream>
+#include <list>
+#include <vector>
+
+#include "dpc_common.hpp"
+#include "qrd.hpp"
+
+using namespace std;
+using namespace std::chrono;
+using namespace sycl;
+
+// Run the modified Gram-Schmidt QR Decomposition algorithm on the given
+// matrices. The function will do the following:
+// 1. Transfer the input matrices to the FPGA.
+// 2. Run the algorithm.
+// 3. Copy the output data back to the host.
+// The above process is carried out 'reps' number of times.
+void QRDecomposition(vector<float> &in_matrix, vector<float> &out_matrix, queue &q,
+ size_t matrices, size_t reps);
+
+int main(int argc, char *argv[]) {
+ constexpr size_t kRandomSeed = 1138;
+ constexpr size_t kRandomMin = 1;
+ constexpr size_t kRandomMax = 10;
+
+ size_t matrices = argc > 1 ? atoi(argv[1]) : 1;
+ if (matrices < 1) {
+ cout << "Must run at least 1 matrix\n";
+ return 1;
+ }
+
+ try {
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ queue q = queue(device_selector, dpc_common::exception_handler);
+ device device = q.get_device();
+ cout << "Device name: " << device.get_info().c_str()
+ << "\n";
+
+ vector<float> a_matrix;
+ vector<float> qr_matrix;
+
+ constexpr size_t kAMatrixSizeFactor = ROWS_COMPONENT * COLS_COMPONENT * 2;
+ constexpr size_t kQRMatrixSizeFactor =
+ (ROWS_COMPONENT + 1) * COLS_COMPONENT * 3;
+ constexpr size_t kIndexAccessFactor = 2;
+
+ a_matrix.resize(matrices * kAMatrixSizeFactor);
+ qr_matrix.resize(matrices * kQRMatrixSizeFactor);
+
+ // For output-postprocessing
+ float q_matrix[ROWS_COMPONENT][COLS_COMPONENT][2];
+ float r_matrix[COLS_COMPONENT][COLS_COMPONENT][2];
+
+ cout << "Generating " << matrices << " random matri"
+ << ((matrices == 1) ? "x " : "ces ") << "\n";
+
+ srand(kRandomSeed);
+
+ for (size_t i = 0; i < matrices; i++) {
+ for (size_t row = 0; row < ROWS_COMPONENT; row++) {
+ for (size_t col = 0; col < COLS_COMPONENT; col++) {
+ int random_val = rand();
+ float random_double =
+ random_val % (kRandomMax - kRandomMin) + kRandomMin;
+ a_matrix[i * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor] = random_double;
+ int random_val_imag = rand();
+ random_double =
+ random_val_imag % (kRandomMax - kRandomMin) + kRandomMin;
+ a_matrix[i * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor + 1] = random_double;
+ }
+ }
+ }
+
+ QRDecomposition(a_matrix, qr_matrix, q, 1, 1); // Accelerator warmup
+
+#if defined(FPGA_EMULATOR)
+ size_t reps = 2;
+#else
+ size_t reps = 32;
+#endif
+ cout << "Running QR decomposition of " << matrices << " matri"
+ << ((matrices == 1) ? "x " : "ces ")
+ << ((reps > 1) ? "repeatedly" : "") << "\n";
+
+ high_resolution_clock::time_point start_time = high_resolution_clock::now();
+ QRDecomposition(a_matrix, qr_matrix, q, matrices, reps);
+ high_resolution_clock::time_point end_time = high_resolution_clock::now();
+ duration<double> diff = end_time - start_time;
+ q.throw_asynchronous();
+
+ cout << " Total duration: " << diff.count() << " s"
+ << "\n";
+ cout << "Throughput: " << reps * matrices / diff.count() / 1000
+ << "k matrices/s"
+ << "\n";
+
+ list<size_t> to_check;
+ // We will check at least matrix 0
+ to_check.push_back(0);
+ // Spot check the last and the middle one
+ if (matrices > 2) to_check.push_back(matrices / 2);
+ if (matrices > 1) to_check.push_back(matrices - 1);
+
+ cout << "Verifying results on matrix";
+
+ for (size_t matrix : to_check) {
+ cout << " " << matrix;
+ size_t idx = 0;
+ for (size_t i = 0; i < COLS_COMPONENT; i++) {
+ for (size_t j = 0; j < COLS_COMPONENT; j++) {
+ if (j < i)
+ r_matrix[i][j][0] = r_matrix[i][j][1] = 0;
+ else {
+ r_matrix[i][j][0] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ r_matrix[i][j][1] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ }
+ }
+ }
+
+ for (size_t j = 0; j < COLS_COMPONENT; j++) {
+ for (size_t i = 0; i < ROWS_COMPONENT; i++) {
+ q_matrix[i][j][0] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ q_matrix[i][j][1] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ }
+ }
+
+ float acc_real = 0;
+ float acc_imag = 0;
+ float v_matrix[ROWS_COMPONENT][COLS_COMPONENT][2] = {{{0}}};
+ for (size_t i = 0; i < ROWS_COMPONENT; i++) {
+ for (size_t j = 0; j < COLS_COMPONENT; j++) {
+ acc_real = 0;
+ acc_imag = 0;
+ for (size_t k = 0; k < COLS_COMPONENT; k++) {
+ acc_real += q_matrix[i][k][0] * r_matrix[k][j][0] -
+ q_matrix[i][k][1] * r_matrix[k][j][1];
+ acc_imag += q_matrix[i][k][0] * r_matrix[k][j][1] +
+ q_matrix[i][k][1] * r_matrix[k][j][0];
+ }
+ v_matrix[i][j][0] = acc_real;
+ v_matrix[i][j][1] = acc_imag;
+ }
+ }
+
+ float error = 0;
+ size_t count = 0;
+ constexpr float kErrorThreshold = 1e-4;
+ for (size_t row = 0; row < ROWS_COMPONENT; row++) {
+ for (size_t col = 0; col < COLS_COMPONENT; col++) {
+ if (std::isnan(v_matrix[row][col][0]) ||
+ std::isnan(v_matrix[row][col][1])) {
+ count++;
+ }
+ float real = v_matrix[row][col][0] -
+ a_matrix[matrix * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor];
+ float imag = v_matrix[row][col][1] -
+ a_matrix[matrix * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor + 1];
+ if (sqrt(real * real + imag * imag) >= kErrorThreshold) {
+ error += sqrt(real * real + imag * imag);
+ count++;
+ }
+ }
+ }
+
+ if (count > 0) {
+ cout << "\nFAILED\n";
+ cout << "\n"
+ << "!!!!!!!!!!!!!! Error = " << error << " in " << count << " / "
+ << ROWS_COMPONENT * COLS_COMPONENT << "\n";
+ return 1;
+ }
+ }
+
+ cout << "\nPASSED\n";
+ return 0;
+
+ } catch (sycl::exception const &e) {
+ cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
+ cout << " If you are targeting an FPGA hardware, "
+ "ensure that your system is plugged to an FPGA board that is "
+ "set up correctly"
+ << "\n";
+ cout << " If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR"
+ << "\n";
+
+ terminate();
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt
new file mode 100755
index 0000000000..5c0cea463c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(DoubleBuffering)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md
new file mode 100755
index 0000000000..31b7e3df37
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md
@@ -0,0 +1,223 @@
+# Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing
+This FPGA tutorial demonstrates how to parallelize host-side processing and buffer transfers between host and device with kernel execution, which can improve overall application performance.
+
+***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How and when to implement the double buffering optimization technique
+| Time to complete | 30 minutes
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+## Purpose
+In an application where the FPGA kernel is executed multiple times, the host must perform the following processing and buffer transfers before each kernel invocation.
+1. The output data from the *previous* invocation must be transferred from device to host and then processed by the host. Examples of this processing include:
+ * Copying the data to another location
+ * Rearranging the data
+ * Verifying it in some way
+2. The input data for the *next* invocation must be processed by the host and then transferred to the device. Examples of this processing include:
+ * Copying the data from another location
+ * Rearranging the data for kernel consumption
+ * Generating the data in some way
+
+Without double buffering, host processing and buffer transfers occur *between* kernel executions. Therefore, there is a gap in time between kernel executions, which you can refer to as kernel *downtime* (see diagram below). If these operations overlap with kernel execution, the kernels can execute back-to-back with minimal downtime, thereby increasing overall application performance.
+
+### Determining When Double Buffering Is Possible
+
+Let's define the required variables:
+* **R** = Time to transfer the kernel's output buffer from device to host.
+* **Op** = Host-side processing time of kernel output data (*output processing*)
+* **Ip** = Host-side processing time for kernel input data (*input processing*)
+* **W** = Time to transfer the kernel's input buffer from host to device.
+* **K** = Kernel execution time
+
+
+
+In general, **R**, **Op**, **Ip**, and **W** operations must all complete before the next kernel is launched. To maximize performance, while one kernel is executing on the device, these operations should execute simultaneously on the host and operate on a second set of buffer locations. They should complete before the current kernel completes, thus allowing the next kernel to be launched immediately with no downtime. In general, to maximize performance, the host must launch a new kernel every **K**.
+
+This leads to the following constraint:
+
+```c++
+R + Op + Ip + W <= K, in order to minimize kernel downtime.
+```
+If the above constraint is not satisfied, a performance improvement may still be observed because *some* overlap (perhaps not complete overlap) is still possible. Further improvement is possible by extending the double buffering concept to N-way buffering (see the corresponding tutorial).
+
+### Measuring the Impact of Double Buffering
+
+You must get a sense of the kernel downtime to identify the degree to which this technique can help improve performance.
+
+This can be done by querying the total kernel execution time from the runtime and comparing it to the overall application execution time. In an application where kernels execute with minimal downtime, these two numbers will be close. However, if kernels have a lot of downtime, overall execution time will notably exceed kernel execution time. The tutorial code exemplifies how to do this.
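+
+As a rough sketch, the kernel-only time can be obtained from SYCL event profiling (the queue must be created with the `enable_profiling` property), while the overall time is measured with an ordinary host timer. The snippet below is illustrative only; `LaunchKernel()` is a placeholder for the kernel submission, not a function from this tutorial's code.
+
+```c++
+// Assumes: using namespace sycl; and a queue q created with the
+// property::queue::enable_profiling property. LaunchKernel() is a
+// placeholder for this tutorial's kernel submission.
+auto wall_start = std::chrono::steady_clock::now();
+event e = LaunchKernel(q);
+e.wait();
+auto wall_end = std::chrono::steady_clock::now();
+
+// Kernel-only time (nanoseconds) from the event's profiling info.
+auto k_start = e.get_profiling_info<info::event_profiling::command_start>();
+auto k_end = e.get_profiling_info<info::event_profiling::command_end>();
+
+double kernel_ms = (k_end - k_start) * 1e-6;
+double wall_ms =
+    std::chrono::duration<double, std::milli>(wall_end - wall_start).count();
+// A large gap between wall_ms and kernel_ms indicates kernel downtime.
+```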
+
+### Tutorial Implementation Notes
+
+The basic idea is to:
+1. Perform the input processing for the first two kernel executions and queue them both.
+2. Immediately call the `process_output()` method for the first kernel. Because of the implicit data dependency, the SYCL* runtime automatically blocks this call until the first kernel completes.
+3. When the first kernel completes, the second kernel begins executing immediately because it was already queued.
+4. While the second kernel runs, the host processes the output data from the first kernel and prepares the input data for the third kernel.
+5. As long as the above operations complete before the second kernel completes, the third kernel is queued early enough to allow it to be launched immediately after the second kernel.
+
+The process then repeats.
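+
+A minimal sketch of this scheme is shown below. `ProcessInput()`, `LaunchKernel()`, and `ProcessOutput()` are placeholders for the host-side input processing, kernel submission, and host-side output processing described above; the tutorial source implements the same pattern with its own helper functions.
+
+```c++
+// 2-way (double) buffering sketch with two buffer sets, buf[0] and buf[1].
+ProcessInput(buf[0]);
+ProcessInput(buf[1]);
+LaunchKernel(buf[0]);
+
+for (int i = 1; i < kTimes; i++) {
+  LaunchKernel(buf[i % 2]);          // Queue the next kernel immediately.
+  ProcessOutput(buf[(i - 1) % 2]);   // Blocks until kernel i-1 completes.
+  ProcessInput(buf[(i - 1) % 2]);    // Prepare input for kernel i+1.
+}
+ProcessOutput(buf[(kTimes - 1) % 2]);  // Output of the final kernel.
+```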
+
+The impact of double buffering on the total runtime of the tutorial program will be analyzed in the "Running the Sample" section below.
+
+## Key Concepts
+* The double buffering optimization technique
+* Determining when double buffering is beneficial
+* How to measure the impact of double buffering
+
+## License
+This code sample is licensed under MIT license.
+
+
+## Building the `double_buffering` Tutorial
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Examining the Reports
+Locate `report.html` in the `double_buffering_report.prj/reports/` or `double_buffering_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*.
+
+Note that because the optimization described in this tutorial takes place at the *runtime* level, the FPGA compiler report will not show a difference between the optimized and unoptimized cases.
+
+
+## Running the Sample
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./double_buffering.fpga_emu (Linux)
+ double_buffering.fpga_emu.exe (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+ ./double_buffering.fpga (Linux)
+ ```
+
+### Example of Output
+
+```
+Platform name: Intel(R) FPGA SDK for OpenCL(TM)
+Device name: pac_a10 : Intel PAC Platform (pac_ee00000)
+
+
+Executing kernel 100 times in each round.
+
+*** Beginning execution, without double buffering
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time without double buffering = 29742 ms
+Total kernel-only execution time without double buffering = 17856 ms
+Throughput = 35.255249 MB/s
+
+
+*** Beginning execution, with double buffering.
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time with double buffering = 17967 ms
+Total kernel-only execution time with double buffering = 17869 ms
+Throughput = 58.35976 MB/s
+
+
+Verification PASSED
+```
+
+### Discussion of Results
+
+A test compile of this tutorial design achieved a maximum frequency (fMAX) of approximately 340 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results with and without double buffering are shown in the following table:
+
+Configuration | Overall Execution Time (ms) | Total Kernel Execution time (ms)
+-|-|-
+Without double buffering | 23462 | 15187
+With double buffering | 15145 | 15034
+
+In both runs, the total kernel execution time is similar, as expected. However, without double buffering, the overall execution time notably exceeds the total kernel execution time, implying there is downtime between kernel executions. With double buffering, the overall execution time is close to the total kernel execution time.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln
new file mode 100755
index 0000000000..4108b65da8
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "double_buffering", "double_buffering.vcxproj", "{6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Debug|x64.ActiveCfg = Debug|x64
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Debug|x64.Build.0 = Debug|x64
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Release|x64.ActiveCfg = Release|x64
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {1878B8F8-3C90-4CB5-9A71-66501FA4A3BA}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj
new file mode 100755
index 0000000000..b7ee382578
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj
@@ -0,0 +1,160 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+ 15.0
+ {6910a54a-bfe5-462f-9f3b-b84f62c5add1}
+ Win32Proj
+ double_buffering
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)double_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)double_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png
new file mode 100755
index 0000000000..2a306929bc
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png differ
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json
new file mode 100755
index 0000000000..b10e6e185a
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "B210B44F-FB86-4F42-BA4A-9980805350FF",
+ "name": "Overlapping Kernel Execution with Buffer Transfers and Host Processing through Double Buffering",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"],
+ "description": "FPGA tutorial design to demonstrate overlapping kernel execution with buffer transfers and host-processing to improve system performance",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "targetDevice": ["FPGA"],
+ "builder": ["ide", "cmake"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./double_buffering.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "double_buffering.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt
new file mode 100755
index 0000000000..f918135042
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt
@@ -0,0 +1,89 @@
+set(SOURCE_FILE double_buffering.cpp)
+set(TARGET_NAME double_buffering)
+
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+
+# Select the target board based on the FPGA_BOARD flag
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS "-fintelfpga")
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE})
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS})
+ set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS})
+endif()
+
+# generate report
+if(WIN32)
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+endif()
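+
+# For reference, with the default A10 board the Linux report command above
+# expands to roughly the following (illustrative only):
+#   dpcpp -fintelfpga -Xshardware -Xsboard=intel_a10gx_pac:pac_a10 \
+#     -fsycl-link double_buffering.cpp -o double_buffering_report.a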
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja
new file mode 100755
index 0000000000..3e8fdc6126
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja
@@ -0,0 +1,30 @@
+source_file = double_buffering.cpp
+target_name = double_buffering
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp
new file mode 100755
index 0000000000..556507e307
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp
@@ -0,0 +1,349 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include "dpc_common.hpp"
+
+using namespace sycl;
+
+// kTimes = # times to execute the kernel. kTimes must be >= 2
+// kSize = # of floats to process on each kernel execution.
+// run less in emulation to avoid high run time
+#if defined(FPGA_EMULATOR)
+constexpr int kTimes = 20;
+constexpr int kSize = 4096;
+#else
+constexpr int kTimes = 100;
+constexpr int kSize = 2621440;
+#endif
+
+// Kernel executes a power function (base^kPow). kPow must be
+// >= 2. Increasing kPow increases kernel execution
+// time, but ProcessOutput() time will also increase.
+constexpr int kPow = 20;
+
+// Number of iterations through the main loop
+constexpr int kNumRuns = 2;
+
+bool pass = true;
+
+class SimpleVpow;
+
+/* Kernel function.
+ Performs buffer_b[i] = buffer_a[i] ** pow
+ Only supports pow >= 2.
+ This kernel is not meant to be an optimal implementation of the power
+ operation -- it's just a sample kernel for this tutorial whose execution time
+ is easily controlled via the pow parameter. SYCL buffers are created
+ externally and passed in by reference to control (external to this function)
+ when the buffers are destructed. The destructor causes a blocking buffer
+ transfer from device to host and double buffering requires us to not block
+ here (because we need to launch another kernel). So we only want this
+ transfer to occur at the end of overall execution, not at the end of each
+ individual kernel execution.
+*/
+void SimplePow(std::unique_ptr<queue> &q, buffer<float, 1> &buffer_a,
+ buffer<float, 1> &buffer_b, event &e) {
+ // Submit to the queue and execute the kernel
+ e = q->submit([&](handler &h) {
+ // Get kernel access to the buffers
+ auto accessor_a = buffer_a.get_access<access::mode::read>(h);
+ auto accessor_b = buffer_b.get_access<access::mode::discard_write>(h);
+
+ const int num = kSize;
+ assert(kPow >= 2);
+ const int p = kPow - 1; // Assumes pow >= 2;
+
+ h.single_task<SimpleVpow>([=]() [[intel::kernel_args_restrict]] {
+ for (int j = 0; j < p; j++) {
+ if (j == 0) {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_a[i] * accessor_a[i];
+ }
+ } else {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_b[i] * accessor_a[i];
+ }
+ }
+ }
+ });
+ });
+
+ event update_host_event;
+ update_host_event = q->submit([&](handler &h) {
+ auto accessor_b = buffer_b.get_access<access::mode::read>(h);
+
+ /*
+ Explicitly instruct the SYCL runtime to copy the kernel's output buffer
+ back to the host upon kernel completion. This is not required for
+ functionality since the buffer access in ProcessOutput() also implicitly
+ instructs the runtime to copy the data back. But it should be noted that
+ this buffer access blocks ProcessOutput() until the kernel is complete
+ and the data is copied. In contrast, update_host() instructs the runtime
+ to perform the copy earlier. This allows ProcessOutput() to optionally
+ perform more useful work *before* making the blocking buffer access. Said
+ another way, this allows ProcessOutput() to potentially perform more work
+ in parallel with the runtime's copy operation.
+ */
+ h.update_host(accessor_b);
+ });
+}
+
+// Returns kernel execution time for a given SYCL event from a queue.
+ulong SyclGetExecTimeNs(event e) {
+ ulong start_time =
+ e.get_profiling_info<info::event_profiling::command_start>();
+ ulong end_time =
+ e.get_profiling_info<info::event_profiling::command_end>();
+ return (end_time - start_time);
+}
+
+// Local pow function for verifying results
+float MyPow(float input, int pow) {
+ return (pow == 0) ? 1 : input * MyPow(input, pow - 1);
+}
+
+/* Compares kernel output against expected output. Only compares part of the
+ output so that this method completes quickly. This is done
+ intentionally/artificially to keep host-processing time shorter than kernel
+ execution time. Grabs kernel output data from its SYCL buffer. Reading from
+ this buffer is a blocking operation that will block on the kernel completing.
+ Queries and records execution time of the kernel that just completed. This
+ is a natural place to do this because ProcessOutput() is blocked on kernel
+ completion.
+*/
+void ProcessOutput(buffer<float, 1> &input_buf,
+ buffer<float, 1> &output_buf, int exec_number, event e,
+ ulong &total_kernel_time_per_slot) {
+ auto input_buf_acc = input_buf.get_access<access::mode::read>();
+ auto output_buf_acc = output_buf.get_access<access::mode::read>();
+ int num_errors = 0;
+ int num_errors_to_print = 10;
+ /* The use of update_host() in the kernel function allows for additional
+ host-side operations to be performed here, in parallel with the buffer copy
+ operation from device to host, before the blocking access to the output
+ buffer is made via output_buf_acc[]. To be clear, no real operations are
+ done here and this is just a note that this is the place
+ where you *could* do it. */
+ for (int i = 0; i < kSize / 8; i++) {
+ const bool out_invalid = (MyPow(input_buf_acc[i], kPow) != output_buf_acc[i]);
+ if ((num_errors < num_errors_to_print) && out_invalid) {
+ if (num_errors == 0) {
+ pass = false;
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ". Showing up to " << num_errors_to_print
+ << " mismatches.\n";
+ }
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ", at element " << i << ". Expected " << std::fixed
+ << std::setprecision(16) << MyPow(input_buf_acc[i], kPow)
+ << " but got " << output_buf_acc[i] << "\n";
+ num_errors++;
+ }
+ }
+
+ // At this point we know the kernel has completed,
+ // so can query the profiling data.
+ total_kernel_time_per_slot += SyclGetExecTimeNs(e);
+}
+
+/*
+ Generates input data for the next kernel execution. Only fills part of the
+ buffer so that this method completes quickly. This is done
+ intentionally/artificially to keep host-processing time shorter than kernel
+ execution time. Writes the data into the associated SYCL buffer. The write
+ will block until the previous kernel execution, that is using this buffer,
+ completes.
+*/
+void ProcessInput(buffer<float, 1> &buf) {
+ // We are generating completely new input data, so can use discard_write()
+ // here to indicate we don't care about the SYCL buffer's current contents.
+ auto buf_acc = buf.get_access<access::mode::discard_write>();
+
+ // RNG seed
+ auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+
+ // RNG engine
+ std::default_random_engine dre(seed);
+
+ // generate random numbers between 1 and 2
+ std::uniform_real_distribution<float> di(1.0f, 2.0f);
+
+ // Randomly generate a start value and increment from there.
+ // Compared to randomly generating every value, this is done to
+ // speed up this function a bit.
+ float start_val = di(dre);
+
+ for (int i = 0; i < kSize / 8; i++) {
+ buf_acc[i] = start_val;
+ start_val++;
+ }
+}
+
+int main() {
+// Create queue, get platform and device
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+ std::cout << "\nEmulator output does not demonstrate true hardware "
+ "performance. The design may need to run on actual hardware "
+ "to observe the performance benefit of the optimization "
+ "exemplified in this tutorial.\n\n";
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ try {
+ auto prop_list =
+ property_list{property::queue::enable_profiling()};
+
+ std::unique_ptr<queue> q;
+ q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list));
+
+ platform platform = q->get_context().get_platform();
+ device device = q->get_device();
+ std::cout << "Platform name: "
+ << platform.get_info<info::platform::name>().c_str() << "\n";
+ std::cout << "Device name: "
+ << device.get_info<info::device::name>().c_str() << "\n\n\n";
+
+ std::cout << "Executing kernel " << kTimes << " times in each round.\n\n";
+
+ // Create a vector to store the input/output SYCL buffers
+ std::vector<buffer<float, 1>> input_buf;
+ std::vector<buffer<float, 1>> output_buf;
+
+ // SYCL events for each kernel launch.
+ event sycl_events[2];
+
+ // In nanoseconds. Total execution time of kernels in a given slot.
+ ulong total_kernel_time_per_slot[2];
+
+ // Total execution time of all kernels.
+ ulong total_kernel_time = 0;
+
+ // Allocate vectors to store the host-side copies of the input data
+ // Create and allocate the SYCL buffers
+ for (int i = 0; i < 2; i++) {
+ input_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+ output_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+ }
+
+ /*
+ Main loop. This loop runs twice to show the performance difference without
+ and with double buffering.
+ */
+ for (int i = 0; i < kNumRuns; i++) {
+ for (int i = 0; i < 2; i++) {
+ total_kernel_time_per_slot[i] = 0; // Initialize timers to zero.
+ }
+
+ switch (i) {
+ case 0: {
+ std::cout << "*** Beginning execution, without double buffering\n";
+ break;
+ }
+ case 1: {
+ std::cout << "*** Beginning execution, with double buffering.\n";
+ break;
+ }
+ default: {
+ std::cout << "*** Beginning execution.\n";
+ }
+ }
+
+ // Start the timer. This will include the time to process the input data
+ // for the first 2 kernel executions.
+ dpc_common::TimeInterval exec_time;
+
+ if (i == 0) { // Single buffering
+ for (int i = 0; i < kTimes; i++) {
+ // Only print every few iterations, just to limit the prints.
+ if (i % 10 == 0) {
+ std::cout << "Launching kernel #" << i << "\n";
+ }
+
+ ProcessInput(input_buf[0]);
+ SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]);
+ ProcessOutput(input_buf[0], output_buf[0], i, sycl_events[0],
+ total_kernel_time_per_slot[0]);
+ }
+ } else { // Double buffering
+ // Process input for first 2 kernel launches and queue them. Then block
+ // on processing the output of the first kernel.
+ ProcessInput(input_buf[0]);
+ ProcessInput(input_buf[1]);
+
+ std::cout << "Launching kernel #0\n";
+
+ SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]);
+ for (int i = 1; i < kTimes; i++) {
+ if (i % 10 == 0) {
+ std::cout << "Launching kernel #" << i << "\n";
+ } // Only print every few iterations, just to limit the prints.
+
+ // Launch the next kernel
+ SimplePow(q, input_buf[i % 2], output_buf[i % 2], sycl_events[i % 2]);
+
+ // Process output from previous kernel. This will block on kernel
+ // completion.
+ ProcessOutput(input_buf[(i - 1) % 2], output_buf[(i - 1) % 2], i,
+ sycl_events[(i - 1) % 2],
+ total_kernel_time_per_slot[(i - 1) % 2]);
+
+ // Generate input for the next kernel.
+ ProcessInput(input_buf[(i - 1) % 2]);
+ }
+
+ // Process output of the final kernel
+ ProcessOutput(input_buf[(kTimes - 1) % 2], output_buf[(kTimes - 1) % 2],
+ i, sycl_events[(kTimes - 1) % 2],
+ total_kernel_time_per_slot[(kTimes - 1) % 2]);
+ }
+
+ // Add up the overall kernel execution time.
+ total_kernel_time = 0;
+ for (int i = 0; i < 2; i++) {
+ total_kernel_time += total_kernel_time_per_slot[i];
+ }
+
+ // Stop the timer.
+ double time_span = exec_time.Elapsed();
+
+ std::cout << "\nOverall execution time "
+ << ((i == 0) ? "without" : "with") << " double buffering = "
+ << (unsigned)(time_span * 1000) << " ms\n";
+ std::cout << "Total kernel-only execution time "
+ << ((i == 0) ? "without" : "with") << " double buffering = "
+ << (unsigned)(total_kernel_time / 1000000) << " ms\n";
+ std::cout << "Throughput = " << std::setprecision(8)
+ << (float)kSize * (float)kTimes * (float)sizeof(float) /
+ (float)time_span / 1000000
+ << " MB/s\n\n\n";
+ }
+ if (pass) {
+ std::cout << "Verification PASSED\n";
+ } else {
+ std::cout << "Verification FAILED\n";
+ return 1;
+ }
+ } catch (sycl::exception const& e) {
+ // Catches exceptions in the host code
+ std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) {
+ std::cout << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cout << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt
new file mode 100755
index 0000000000..134e6d8534
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+
+cmake_minimum_required (VERSION 2.8)
+
+project(NWayBuffering)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md
new file mode 100755
index 0000000000..d4fb12ba40
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md
@@ -0,0 +1,297 @@
+
+# N-Way Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing
+
+This FPGA tutorial demonstrates how to parallelize host-side processing and buffer transfers between host and device with kernel execution to improve overall application performance. It is a generalization of the 'double buffering' technique, and can be used to perform this overlap even when the host-processing time exceeds kernel execution time.
+
+***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How and when to apply the N-way buffering optimization technique
+| Time to complete | 30 minutes
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+## Purpose
+N-Way buffering is a generalization of the double buffering optimization technique (see the "Double Buffering" FPGA tutorial). This system-level optimization enables kernel execution to occur in parallel with host-side processing and buffer transfers between host and device, improving application performance. N-way buffering can achieve this overlap even when the host-processing time exceeds kernel execution time.
+
+### Background
+
+In an application where the FPGA kernel is executed multiple times, the host must perform the following processing and buffer transfers before each kernel invocation:
+1. The output data from the *previous* invocation must be transferred from the device to host and then processed by the host. Examples of this processing include the following:
+ * Copying the data to another location
+ * Rearranging the data
+ * Verifying it in some way
+2. The input data for the *next* invocation must be processed by the host and then transferred to the device. Examples of this processing include:
+ * Copying the data from another location
+ * Rearranging the data for kernel consumption
+ * Generating the data in some way
+
+Without the technique described in this tutorial, host processing and buffer transfers occur *between* kernel executions. Therefore, there is a gap in time between kernel executions, which you can refer to as kernel "downtime" (see diagram below). If these operations overlap with kernel execution, the kernels can execute back-to-back with minimal downtime, thereby increasing overall application performance.
+
+### N-Way Buffering
+
+This technique is referred to as *N-Way Buffering*, but is frequently called *double buffering* in the most common case where N=2.
+
+Let's first define some variables:
+
+| Variable | Description |
+| ------ | ------ |
+| **R** | Time to transfer the kernel's output buffer from device to host. |
+| **Op** | Host-side processing time of kernel output data (*output processing*). |
+| **Ip** | Host-side processing time for kernel input data (*input processing*). |
+| **W** | Time to transfer the kernel's input buffer from host to device. |
+| **K** | Kernel execution time. |
+| **N** | Number of buffer sets used. |
+| **C** | Number of host-side CPU cores. |
+
+
+
+
+
+In general, the **R**, **Op**, **Ip**, and **W** operations must all complete before the next kernel is launched. To maximize performance, while one kernel is executing on the device, these operations should run in parallel and operate on a separate set of buffer locations. They should complete before the current kernel completes, thus allowing the next kernel to be launched immediately with no downtime. In general, to maximize performance, the host must launch a new kernel every **K**.
+
+If these host-side operations are executed serially, this leads to the following constraint:
+
+```c++
+R + Op + Ip + W <= K, to minimize kernel downtime.
+```
+
+In the above example, if the constraint is satisfied, the application requires two sets of buffers. In this case, **N**=2.
+
+However, the above constraint may not be satisfied in some applications (for example, if host processing takes longer than the kernel execution time).
+
+**NOTE**: A performance improvement may still be observed because kernel downtime may still be reduced (though perhaps not maximally reduced).
+
+In this case, to further improve performance, reduce the host-processing time through multi-threading. Rather than executing the above operations serially, perform the input- and output-processing operations in parallel using two threads, leading to the following constraint:
+
+```c++
+Max (R+Op, Ip+W) <= K
+and
+R + W <= K, to minimize kernel downtime.
+```
+
+If the above constraint is still unsatisfied, the technique can be extended beyond two sets of buffers to **N** sets of buffers to help improve the degree of overlap. In this case, the constraint becomes:
+
+```c++
+Max (R + Op, Ip + W) <= (N-1)*K
+and
+R + W <= K, to minimize kernel downtime.
+```
+
+The idea of N-way buffering is to prepare **N** sets of kernel input buffers, launch **N** kernels, and when the first kernel completes, begin the subsequent host-side operations. These operations may take a long time (longer than **K**), but they do not cause kernel downtime because an additional **N**-1 kernels have already been queued and can launch immediately. By the time these first **N** kernels complete, the aforementioned host-side operations would have also completed and the **N**+1 kernel can be launched with no downtime. As additional kernels complete, corresponding host-side operations are launched on the host, in a parallel fashion, using multiple threads. Although the host operations take longer than **K**, if **N** is chosen correctly, they will complete with a period of **K**, which is required to ensure we can launch a new kernel every **K**. To reiterate, this scheme requires multi-threaded host-operations because the host must perform processing for up to **N** kernels in parallel in order to keep up.
+
+The above formula can be used to calculate the **N** required to minimize downtime. However, there are some practical limits:
+* **N** sets of buffers are required on both the host and device, therefore both must have the capacity for this many buffers.
+* If the input- and output-processing operations are launched in separate threads, then (**N**-1)*2 cores are required, so **C** can become the limiting factor.
+
+### Measuring the Impact of N-Way Buffering
+
+To identify the degree to which this technique can improve performance, you must first get a sense of the kernel downtime.
+
+This can be done by querying the total kernel execution time from the runtime and comparing it with the overall application execution time. In an application where kernels execute with minimal downtime, these two numbers are close. However, if the kernels have significant downtime, the overall execution time notably exceeds the kernel execution time. The tutorial code demonstrates how to do this.
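+
+The following is a minimal sketch of such a measurement (illustrative only, not the tutorial's exact code). It assumes the queue was created with the `property::queue::enable_profiling()` property so that kernel events carry timing information:
+
+```c++
+#include <CL/sycl.hpp>
+#include <chrono>
+#include <cstdint>
+#include <vector>
+using namespace sycl;
+
+// Sum the device-side execution time of a set of kernel events, in ms.
+double TotalKernelMs(const std::vector<event> &kernel_events) {
+  uint64_t ns = 0;
+  for (const auto &e : kernel_events) {
+    ns += e.get_profiling_info<info::event_profiling::command_end>() -
+          e.get_profiling_info<info::event_profiling::command_start>();
+  }
+  return ns / 1.0e6;  // nanoseconds -> milliseconds
+}
+
+// Usage (sketch): bracket the whole launch/process loop with a wall-clock
+// timer and compare the two numbers. A large gap indicates kernel downtime.
+//   auto t0 = std::chrono::steady_clock::now();
+//   ... launch kernels, collect their events, process buffers ...
+//   double overall_ms = std::chrono::duration<double, std::milli>(
+//       std::chrono::steady_clock::now() - t0).count();
+//   double downtime_ms = overall_ms - TotalKernelMs(events);
+```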
+
+### Tutorial Implementation Notes
+
+The example code runs with multiple iterations to illustrate how performance improves as **N** increases and as multi-threading is used.
+
+It is useful to think of the execution space as having **N** slots where the slots execute in chronological order, and each slot has its own set of buffers on the host and device. At the beginning of execution, the host prepares the kernel input data for the **N** slots and launches **N** kernels. When slot-0 completes, slot-1 begins executing immediately because it was already queued. The host begins both the output and input processing for slot-0. These two operations must complete before the host can queue another kernel into slot-0. The same is true for all slots.
+
+After each kernel is launched, the host-side operations (that occur *after* the kernel in that slot completes) are launched immediately from the `main()` program. They block until the kernel execution for that slot completes (this is enforced by the runtime).
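+
+The scheduling pattern can be reduced to the following standalone toy (no SYCL; the "kernel" is simulated with a sleep, and the constants, helpers, and names are illustrative, not the tutorial's code). It shows how the slots, threads, and join points relate:
+
+```c++
+// Standalone illustration of the N-slot scheduling pattern.
+#include <chrono>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+constexpr int kN = 3;        // number of buffer sets (slots)
+constexpr int kLaunches = 9; // total number of "kernel" launches
+
+void ProcessInput(int)  { std::this_thread::sleep_for(std::chrono::milliseconds(30)); }
+void ProcessOutput(int) { std::this_thread::sleep_for(std::chrono::milliseconds(30)); }
+
+int main() {
+  std::vector<std::thread> t_in(kN), t_out(kN);
+
+  // Prepare input for the first kN launches, in parallel.
+  for (int slot = 0; slot < kN; slot++) t_in[slot] = std::thread(ProcessInput, slot);
+
+  for (int i = 0; i < kLaunches; i++) {
+    int slot = i % kN;
+    if (i >= kN) t_out[slot].join();  // previous output for this slot processed?
+    t_in[slot].join();                // new input for this slot ready?
+
+    std::cout << "Launching kernel #" << i << " in slot " << slot << "\n";
+    // A real design would submit a SYCL kernel here and keep its event.
+
+    t_out[slot] = std::thread(ProcessOutput, slot);  // post-process this launch
+    if (i < kLaunches - kN)                          // prepare this slot's next input
+      t_in[slot] = std::thread(ProcessInput, slot);
+  }
+  for (int slot = 0; slot < kN; slot++) t_out[slot].join();
+  return 0;
+}
+```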
+
+
+## Key Concepts
+* The N-way buffering optimization technique as a generalization of double buffering
+* Determining when N-way buffering is practical and beneficial
+* How to measure the impact of N-way buffering
+
+## License
+This code sample is licensed under the MIT license.
+
+
+## Building the `n_way_buffering` Tutorial
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Examining the Reports
+Locate `report.html` in the `n_way_buffering_report.prj/reports/` or `n_way_buffering_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*.
+
+Note that because the optimization described in this tutorial takes place at the *runtime* level, the FPGA compiler report will not show a difference between the optimized and unoptimized cases.
+
+
+## Running the Sample
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./n_way_buffering.fpga_emu (Linux)
+ n_way_buffering.fpga_emu.exe (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+ ./n_way_buffering.fpga (Linux)
+ ```
+
+### Example of Output
+
+```
+Platform name: Intel(R) FPGA SDK for OpenCL(TM)
+Device name: pac_a10 : Intel PAC Platform (pac_ec00000)
+
+
+Executing kernel 100 times in each round.
+
+*** Beginning execution, 1-way buffering, single-threaded host operations
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time = 65915 ms
+Total kernel-only execution time = 17852 ms
+Throughput = 15.907802 MB/s
+
+
+*** Beginning execution, 1-way buffering, multi-threaded host operations.
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time = 51814 ms
+Total kernel-only execution time = 17852 ms
+Throughput = 20.237082 MB/s
+
+
+*** Beginning execution, 2-way buffering, multi-threaded host operations
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time = 26109 ms
+Total kernel-only execution time = 17852 ms
+Throughput = 40.160442 MB/s
+
+
+*** Beginning execution, N=5-way buffering, multi-threaded host operations
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time with N-way buffering = 18763 ms
+Total kernel-only execution time with N-way buffering = 17851 ms
+Throughput = 55.884682 MB/s
+
+
+Verification PASSED
+```
+
+### Discussion of Results
+
+A test compile of this tutorial design achieved an fMAX of approximately 340 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results are shown in the following table:
+
+Configuration | Overall Execution Time (ms) | Total Kernel Execution time (ms)
+-|-|-
+1-way buffering, single-threaded | 64401 | 15187
+1-way buffering, multi-threaded | 53540 | 15187
+2-way buffering, multi-threaded | 27281 | 15187
+5-way buffering, multi-threaded | 16284 | 15188
+
+In all runs, the total kernel execution time is similar, as expected. In the first three configurations, the overall execution time notably exceeds the total kernel execution time, implying there is downtime between kernel executions. However, as we switch from single-threaded to multi-threaded host operations and increase the number of buffer sets used, the overall execution time approaches the kernel execution time.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png
new file mode 100755
index 0000000000..2a306929bc
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png differ
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln
new file mode 100755
index 0000000000..5a77b3049a
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "n_way_buffering", "n_way_buffering.vcxproj", "{49E7063B-56DA-4ACF-B153-5B56A98645BE}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.ActiveCfg = Debug|x64
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.Build.0 = Debug|x64
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.ActiveCfg = Release|x64
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {CC320E26-0D79-434A-8E69-3F09BFB2FCF4}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj
new file mode 100755
index 0000000000..dff6f99529
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj
@@ -0,0 +1,160 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+ 15.0
+ {49e7063b-56da-4acf-b153-5b56a98645be}
+ Win32Proj
+ n_way_buffering
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)n_way_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)n_way_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json
new file mode 100755
index 0000000000..dffbded768
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "2100C9BD-331C-475B-9878-4D14AAF0981D",
+ "name": "Overlapping Kernel Execution with Buffer Transfers and Host-Processing through N-Way Buffering",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"],
+ "description": "FPGA tutorial design to demonstrate overlapping kernel execution with buffer transfers and multi-threaded host-processing to improve system performance",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "targetDevice": ["FPGA"],
+ "builder": ["ide", "cmake"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./n_way_buffering.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "n_way_buffering.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt
new file mode 100755
index 0000000000..cf12b30f72
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt
@@ -0,0 +1,93 @@
+set(SOURCE_FILE n_way_buffering.cpp)
+set(TARGET_NAME n_way_buffering)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS " -lpthread -fintelfpga")
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+  set(FPGA_OBJ_FILE "dev_fpga.o")
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+
+ add_custom_command(OUTPUT ${FPGA_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} -fintelfpga -c ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${FPGA_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} ${FPGA_OBJ_FILE} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} -lpthread
+ DEPENDS ${FPGA_OBJ_FILE})
+endif()
+
+
+# report
+if(WIN32)
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja
new file mode 100755
index 0000000000..80284aff9b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja
@@ -0,0 +1,30 @@
+source_file = n_way_buffering.cpp
+target_name = n_way_buffering
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp
new file mode 100755
index 0000000000..c5428348db
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp
@@ -0,0 +1,437 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <thread>
+#include <vector>
+#include "dpc_common.hpp"
+
+using namespace sycl;
+
+// N-way buffering. N must be >= 1.
+constexpr int kLocalN = 5;
+
+// # times to execute the kernel. kTimes must be >= kLocalN
+#if defined(FPGA_EMULATOR)
+constexpr int kTimes = 20;
+#else
+constexpr int kTimes = 100;
+#endif
+
+// # of floats to process on each kernel execution.
+#if defined(FPGA_EMULATOR)
+constexpr int kSize = 4096;
+#else
+constexpr int kSize = 2621440; // ~10MB
+#endif
+
+// Kernel executes a power function (base^kPow). Must be
+// >= 2. Can increase this to increase kernel execution
+// time, but ProcessOutput() time will also increase.
+constexpr int kPow = 20;
+
+// Number of iterations through the main loop
+constexpr int kNumRuns = 4;
+
+bool pass = true;
+
+class SimpleVpow;
+
+/* Kernel function.
+ Performs buffer_b[i] = buffer_a[i] ** pow
+ Only supports pow >= 2.
+ This kernel is not meant to be an optimal implementation of the power
+ operation -- it's just a sample kernel for this tutorial whose execution time
+ is easily controlled via the pow parameter. SYCL buffers are created
+ externally and passed in by reference to control (external to this function)
+ when the buffers are destructed. The destructor causes a blocking buffer
+ transfer from device to host and N-way buffering requires us to not block
+ here (because we need to queue more kernels). So we only want this transfer
+ to occur at the end of overall execution, not at the end of each individual
+ kernel execution.
+*/
+void SimplePow(std::unique_ptr<queue> &q, buffer<float, 1> &buffer_a,
+               buffer<float, 1> &buffer_b, event &e) {
+  // Submit to the queue and execute the kernel
+  e = q->submit([&](handler &h) {
+    // Get kernel access to the buffers
+    auto accessor_a = buffer_a.get_access<access::mode::read>(h);
+    auto accessor_b = buffer_b.get_access<access::mode::discard_read_write>(h);
+
+ const int num = kSize;
+ const int p = kPow - 1; // Assumes pow >= 2;
+ assert(kPow >= 2);
+
+    h.single_task<SimpleVpow>([=]() [[intel::kernel_args_restrict]] {
+ for (int j = 0; j < p; j++) {
+ if (j == 0) {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_a[i] * accessor_a[i];
+ }
+ } else {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_b[i] * accessor_a[i];
+ }
+ }
+ }
+ });
+ });
+
+ event update_host_event;
+ update_host_event = q->submit([&](handler &h) {
+    auto accessor_b = buffer_b.get_access<access::mode::read>(h);
+
+ /*
+ Explicitly instruct the SYCL runtime to copy the kernel's output buffer
+ back to the host upon kernel completion. This is not required for
+ functionality since the buffer access in ProcessOutput() also implicitly
+ instructs the runtime to copy the data back. But it should be noted that
+ this buffer access blocks ProcessOutput() until the kernel is complete
+ and the data is copied. In contrast, update_host() instructs the runtime
+ to perform the copy earlier. This allows ProcessOutput() to optionally
+ perform more useful work *before* making the blocking buffer access. Said
+ another way, this allows ProcessOutput() to potentially perform more work
+ in parallel with the runtime's copy operation.
+ */
+ h.update_host(accessor_b);
+ });
+
+}
+
+// Returns kernel execution time for a given SYCL event from a queue.
+ulong SyclGetExecTimeNs(event e) {
+  ulong start_time =
+      e.get_profiling_info<info::event_profiling::command_start>();
+  ulong end_time =
+      e.get_profiling_info<info::event_profiling::command_end>();
+ return (end_time - start_time);
+}
+
+// Local pow function for verifying results
+float MyPow(float input, int pow) {
+ return (pow == 0) ? 1 : input * MyPow(input, pow - 1);
+}
+
+/* Compares kernel output against expected output.
+ Grabs kernel output data from its SYCL buffer. Reading from this buffer is a
+ blocking operation that will block on the kernel completing. Grabs expected
+ output from a host-side copy of the input data. A copy is used to allow for
+ parallel generation of the input data for the next execution. Queries and
+ records execution time of the kernel that just completed. This is a natural
+ place to do this because ProcessOutput() is blocked on kernel completion.
+*/
+void ProcessOutput(buffer<float, 1> &output_buf,
+                   std::vector<float> &input_copy, int exec_number, event e,
+                   ulong &total_kernel_time_per_slot) {
+  auto output_buf_acc = output_buf.get_access<access::mode::read>();
+ int num_errors = 0;
+ int num_errors_to_print = 10;
+
+ /* The use of update_host() in the kernel function allows for additional
+ host-side operations to be performed here, in parallel with the buffer copy
+ operation from device to host, before the blocking access to the output
+ buffer is made via output_buf_acc[]. To be clear, no real operations are
+ done here and this is just a note that this is the place
+ where you *could* do it. */
+ for (int i = 0; i < kSize; i++) {
+    bool out_mismatch = (MyPow(input_copy.data()[i], kPow) != output_buf_acc[i]);
+    if ((num_errors < num_errors_to_print) && out_mismatch) {
+ if (num_errors == 0) {
+ pass = false;
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ". Showing up to " << num_errors_to_print
+ << " mismatches.\n";
+ }
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ", at element " << i << ". Expected " << std::fixed
+ << std::setprecision(16) << MyPow(input_copy.data()[i], kPow)
+ << " but got " << output_buf_acc[i] << "\n";
+ num_errors++;
+ }
+ }
+
+ // At this point we know the kernel has completed, so can query the profiling
+ // data.
+ total_kernel_time_per_slot += SyclGetExecTimeNs(e);
+}
+
+/*
+ Generates input data for the next kernel execution.
+ Writes the data into the associated SYCL buffer. The write will block until
+ the previous kernel execution, that is using this buffer, completes. Writes a
+ copy of the data into a host-side buffer that will later be used by
+ ProcessOutput().
+*/
+void ProcessInput(buffer<float, 1> &buf, std::vector<float> &copy) {
+ // We are generating completely new input data, so can use discard_write()
+ // here to indicate we don't care about the SYCL buffer's current contents.
+  auto buf_acc = buf.get_access<access::mode::discard_write>();
+
+ // RNG seed
+ auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+
+ // RNG engine
+ std::default_random_engine dre(seed);
+
+ // Values between 1 and 2
+  std::uniform_real_distribution<float> di(1.0f, 2.0f);
+
+ // Randomly generate a start value and increment from there.
+ // Compared to randomly generating every value, this is done to
+ // speed up this function a bit.
+ float start_val = di(dre);
+
+ for (int i = 0; i < kSize; i++) {
+ buf_acc[i] = start_val;
+ copy.data()[i] = start_val;
+ start_val++;
+ }
+}
+
+int main() {
+// Create queue, get platform and device
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+ std::cout << "\nEmulator output does not demonstrate true hardware "
+ "performance. The design may need to run on actual hardware "
+ "to observe the performance benefit of the optimization "
+ "exemplified in this tutorial.\n\n";
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ try {
+ auto prop_list =
+ property_list{property::queue::enable_profiling()};
+
+    std::unique_ptr<queue> q;
+ q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list));
+
+ platform platform = q->get_context().get_platform();
+ device device = q->get_device();
+ std::cout << "Platform name: "
+ << platform.get_info().c_str() << "\n";
+ std::cout << "Device name: "
+ << device.get_info().c_str() << "\n\n\n";
+
+ std::cout << "Executing kernel " << kTimes << " times in each round.\n\n";
+
+ // Create a vector to store the input/output SYCL buffers
+    std::vector<buffer<float, 1>> input_buf;
+    std::vector<buffer<float, 1>> output_buf;
+
+ // For every execution slot, we need 2 host-side buffers
+ // to store copies of the input data. One is used to
+ // verify the previous kernel's output. The other stores
+ // the new data for the next kernel execution.
+    std::vector<float> input_buf_copy[2 * kLocalN];
+
+ // SYCL events for each kernel launch.
+ event sycl_events[kLocalN];
+
+ // In nanoseconds. Total execution time of kernels in a given slot.
+ ulong total_kernel_time_per_slot[kLocalN];
+
+ // Total execution time of all kernels.
+ ulong total_kernel_time = 0;
+
+ // Threads to process the output from each kernel
+ std::thread t_process_output[kLocalN];
+
+ // Threads to process the input data for the next kernel
+ std::thread t_process_input[kLocalN];
+
+ // Demonstrate with 1-way buffering first, then N-way buffering.
+ int N;
+
+ // st = "single threaded".
+ // Used to enable multi-threading in subsequent runs.
+ bool st = true;
+
+ // Allocate vectors to store the host-side copies of the input data
+ for (int i = 0; i < 2 * kLocalN; i++) {
+      input_buf_copy[i] = std::vector<float>(kSize);
+ }
+
+ // Create and allocate the SYCL buffers
+ for (int i = 0; i < kLocalN; i++) {
+      input_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+      output_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+ }
+
+ /*
+ Main loop.
+ This loop runs multiple times to demonstrate how performance can be
+ improved by increasing the number of buffers as well as multi-threading
+ the host-side operations. The first iteration is a base run, demonstrating
+ the performance with none of these optimizations (ie. 1-way buffering,
+ single-threaded).
+ */
+ for (int i = 0; i < kNumRuns; i++) {
+      for (int slot = 0; slot < kLocalN; slot++) {
+        total_kernel_time_per_slot[slot] = 0;  // Initialize timers to zero.
+      }
+
+ switch (i) {
+ case 0: {
+ std::cout << "*** Beginning execution, 1-way buffering, "
+ "single-threaded host operations\n";
+ N = 1;
+ st = true;
+ break;
+ }
+ case 1: {
+ std::cout << "*** Beginning execution, 1-way buffering, "
+ "multi-threaded host operations.\n";
+ N = 1;
+ st = false;
+ break;
+ }
+ case 2: {
+ std::cout << "*** Beginning execution, 2-way buffering, "
+                       "multi-threaded host operations\n";
+ N = 2;
+ st = false;
+ break;
+ }
+ case 3: {
+ std::cout << "*** Beginning execution, N=" << kLocalN
+ << "-way buffering, multi-threaded host operations\n";
+ N = kLocalN;
+ st = false;
+ break;
+ }
+ default:
+ std::cout << "*** Beginning execution.\n";
+ }
+
+ // Start the timer. This will include the time to process the
+ // input data for the first N kernel executions.
+ dpc_common::TimeInterval exec_time;
+
+ // Process the input data for first N kernel executions. For
+ // multi-threaded runs, this is done in parallel.
+ for (int i = 0; i < N; i++) {
+ t_process_input[i] = std::thread(ProcessInput, std::ref(input_buf[i]),
+ std::ref(input_buf_copy[i]));
+ if (st) {
+ t_process_input[i].join();
+ }
+ }
+
+ /*
+ It's useful to think of the kernel execution space as having N slots.
+ Conceptually, the slots are executed chronologically sequentially on the
+ device (i.e. slot 0 to N-1). Each slot has its own buffering on both the
+ host and device. Before launching a kernel in a given slot, we must
+ process output data from the previous execution that occurred in that
+ slot and process new input data for the upcoming new execution in that
+ slot.
+ */
+ for (int i = 0; i < kTimes; i++) {
+ // The current slot is i%N.
+ // Before each kernel launch, the ProcessOutput() must have completed
+ // for the last execution in this slot. The ProcessInput() must also
+ // have completed for the upcoming new execution for this slot. Block on
+ // both of these.
+ if (!st) {
+ // ProcessOutput() is only relevant after the
+ // first N kernels have been launched.
+ if (i >= N) {
+ t_process_output[i % N].join();
+ }
+
+ t_process_input[i % N].join();
+ }
+
+ // Launch the kernel. This is non-blocking with respect to main().
+ // Only print every few iterations, just to limit the prints.
+ if (i % 10 == 0) {
+ std::cout << "Launching kernel #" << i << "\n";
+ }
+
+ SimplePow(q, input_buf[i % N], output_buf[i % N], sycl_events[i % N]);
+
+ // Immediately launch threads for the ProcessOutput() and
+ // ProcessInput() for *this* slot. These are non-blocking with respect
+ // to main(), but they will individually be blocked until the
+ // corresponding kernel execution is complete. The ProcessOutput()
+ // compares the kernel output data against the input data. But
+ // ProcessInput() will be overwriting that input data in parallel.
+ // Therefore ProcessOutput() must compare against an older copy of the
+ // data. We ping-pong between host-side copies of the input data.
+ t_process_output[i % N] = std::thread(
+ ProcessOutput, std::ref(output_buf[i % N]),
+ std::ref(input_buf_copy[i % (2 * N)]), i, sycl_events[i % N],
+ std::ref(total_kernel_time_per_slot[i % N]));
+
+ // For single-threaded runs, force single-threaded operation by
+ // blocking here immediately.
+ if (st) {
+ t_process_output[i % N].join();
+ }
+
+ // For the final N kernel launches, no need to process
+ // input data because there will be no more launches.
+ if (i < kTimes - N) {
+ // The indexes for the input_buf_copy used by ProcessOutput() and
+ // ProcessInput() are spaced N apart.
+ t_process_input[i % N] =
+ std::thread(ProcessInput, std::ref(input_buf[i % N]),
+ std::ref(input_buf_copy[(i + N) % (2 * N)]));
+
+ if (st) {
+ t_process_input[i % N].join();
+ }
+ }
+ }
+
+ // Wait for the final N threads to finish and add up the overall kernel
+ // execution time.
+ total_kernel_time = 0;
+ for (int i = 0; i < N; i++) {
+ if (!st) {
+ t_process_output[i].join();
+ }
+ total_kernel_time += total_kernel_time_per_slot[i];
+ }
+
+ // Stop the timer.
+ double time_span = exec_time.Elapsed();
+
+ std::cout << "\nOverall execution time "
+ << ((i == kNumRuns - 1) ? ("with N-way buffering ") : "")
+ << "= " << (unsigned)(time_span * 1000) << " ms\n";
+ std::cout << "Total kernel-only execution time "
+ << ((i == kNumRuns - 1) ? ("with N-way buffering ") : "")
+ << "= " << (unsigned)(total_kernel_time / 1000000) << " ms\n";
+ std::cout << "Throughput = " << std::setprecision(8)
+ << (float)kSize * (float)kTimes * (float)sizeof(float) /
+ (float)time_span / 1000000
+ << " MB/s\n\n\n";
+ }
+ if (pass) {
+ std::cout << "Verification PASSED\n";
+ } else {
+ std::cout << "Verification FAILED\n";
+ return 1;
+ }
+ } catch (sycl::exception const& e) {
+ // Catches exceptions in the host code
+ std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) {
+ std::cout << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cout << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt
new file mode 100755
index 0000000000..4835f73b5f
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(LocalMemoryCache)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md
new file mode 100755
index 0000000000..8a974787e4
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md
@@ -0,0 +1,189 @@
+# Caching On-Chip Memory to Improve Loop Performance
+This FPGA tutorial demonstrates how to build a simple cache (implemented in FPGA registers) to store recently-accessed memory locations so that the compiler can achieve II=1 on critical loops in task kernels.
+
+
+***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta); Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How and when to implement the on-chip memory cache optimization
+| Time to complete | 30 minutes
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+## Purpose
+In DPC++ task kernels for FPGA, it is always our objective to achieve an initiation interval (II) of 1 on performance-critical loops. This means that a new loop iteration is launched on every clock cycle, maximizing the throughput of the loop.
+
+When the loop contains a loop-carried variable that is implemented in on-chip memory, the compiler often *cannot* achieve II=1 because the memory access takes more than one clock cycle. If the updated memory location may be needed on the next loop iteration, the next iteration must be delayed to allow time for the update, hence II > 1.
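+
+As a minimal illustration, the problematic pattern is just a read-modify-write of a small array inside the loop. This is a plain C++ rendering; the function and variable names are placeholders, not the tutorial's code:
+
+```c++
+#include <cstdint>
+#include <vector>
+
+// Each iteration reads, increments, and writes back one location of a small
+// array. On an FPGA, with the array in on-chip memory, the write takes more
+// than one cycle, so an iteration that may hit the same bucket must wait for
+// the previous update, forcing II > 1.
+void NaiveHistogram(const std::vector<uint32_t> &input, uint32_t *counts,
+                    uint32_t num_buckets) {
+  for (size_t i = 0; i < input.size(); i++) {
+    uint32_t b = input[i] % num_buckets;  // output bucket
+    counts[b] = counts[b] + 1;            // loop-carried read-modify-write
+  }
+}
+```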
+
+The on-chip memory cache technique breaks this dependency by storing recently-accessed values in a cache capable of a 1-cycle read-modify-write operation. The cache is implemented in FPGA registers rather than on-chip memory. By pulling memory accesses preferentially from the register cache, the loop-carried dependency is broken.
+
+### When is the on-chip memory cache technique applicable?
+
+***Failure to achieve II=1 because of a loop-carried memory dependency in on-chip memory***:
+The on-chip memory cache technique is applicable if the compiler could not pipeline a loop with II=1 because of an on-chip memory dependency. (If the compiler could not achieve II=1 because of a *global* memory dependency, this technique does not apply, as the access latencies are too great.)
+
+To check this for a given design, view the "Loops Analysis" section of its optimization report. The report lists the II of all loops and explains why a lower II is not achievable. Check whether the reason given resembles "the compiler failed to schedule this loop with smaller II due to memory dependency". The report will describe the "most critical loop feedback path during scheduling". Check whether this includes on-chip memory load/store operations on the critical path.
+
+***An II=1 loop with a load operation of latency 1***:
+The compiler is capable of reducing the latency of on-chip memory accesses in order to achieve II=1. However, in doing so the compiler makes a trade-off, sacrificing fMAX to better optimize the loop.
+
+In a design with II=1 critical loops but lower than desired fMAX, the on-chip memory cache technique may still be applicable. It can help recover fMAX by enabling the compiler to achieve II=1 with a higher latency memory access.
+
+To check whether this is the case for a given design, view the "Kernel Memory Viewer" section of the optimization report. Select the on-chip memory of interest from the Kernel Memory List, and mouse over the load operation "LD" to check its latency. If the latency of the load operation is 1, this is a clear sign that the compiler has attempted to sacrifice fMAX to better optimize a loop.
+
+
+### Implementing the on-chip memory cache technique
+
+The tutorial demonstrates the technique using a program that computes a histogram. The histogram operation accepts an input vector of values, separates the values into buckets, and counts the number of values per bucket. For each input value, an output bucket location is determined, and the count for the bucket is incremented. This count is stored in the on-chip memory and the increment operation requires reading from the memory, performing the increment, and storing the result. This read-modify-write operation is the critical path that can result in II > 1.
+
+To reduce II, the idea is to store recently accessed values in a cache implemented in FPGA registers, which is capable of a 1-cycle read-modify-write operation. If the memory location required on a given iteration exists in the cache, its value is taken from there. The updated count is written back to *both* the cache and the on-chip memory. The `ivdep` attribute is added to inform the compiler that if the same histogram location is accessed again within `CACHE_DEPTH` iterations, its up-to-date value is guaranteed to be available immediately from the cache, so the on-chip memory dependency need not be enforced across those iterations.
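+
+A simplified sketch of the cached loop follows. The array and variable names are illustrative, `input` and `num_inputs` stand in for the kernel's input data, and the exact `ivdep` attribute spelling may differ by compiler version; see `src/onchip_memory_cache.cpp` for the real implementation:
+
+```c++
+// Kernel-body fragment (illustrative names; assumes kNumBuckets and
+// CACHE_DEPTH are compile-time constants).
+uint32_t histogram[kNumBuckets];        // on-chip memory
+uint32_t cache_value[CACHE_DEPTH + 1];  // register cache: recent counts
+uint32_t cache_index[CACHE_DEPTH + 1];  // register cache: recent buckets
+for (uint32_t b = 0; b < kNumBuckets; b++) histogram[b] = 0;
+for (int i = 0; i < CACHE_DEPTH + 1; i++) cache_index[i] = kNumBuckets;  // "empty"
+
+[[intel::ivdep(CACHE_DEPTH)]]  // assert: no dependency closer than CACHE_DEPTH iterations
+for (uint32_t n = 0; n < num_inputs; n++) {
+  uint32_t b = input[n] % kNumBuckets;  // output bucket
+  uint32_t count = histogram[b];        // on-chip memory read (multi-cycle)
+
+  // If this bucket was updated within the last CACHE_DEPTH iterations, take
+  // the newer value from the register cache (1-cycle). Iterate from oldest
+  // to newest so the newest matching entry wins.
+  #pragma unroll
+  for (int i = CACHE_DEPTH; i >= 0; i--) {
+    if (cache_index[i] == b) count = cache_value[i];
+  }
+  count++;
+
+  // Shift the cache and insert the updated count at the front (newest).
+  #pragma unroll
+  for (int i = CACHE_DEPTH; i > 0; i--) {
+    cache_value[i] = cache_value[i - 1];
+    cache_index[i] = cache_index[i - 1];
+  }
+  cache_value[0] = count;
+  cache_index[0] = b;
+
+  // Write back to both the cache (above) and the on-chip memory.
+  histogram[b] = count;
+}
+```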
+
+### Selecting the cache depth
+
+While any value of `CACHE_DEPTH` results in functional hardware, the ideal value of `CACHE_DEPTH` requires some experimentation. The depth of the cache needs to roughly cover the latency of the on-chip memory access. To determine the correct value, start with a value of 2 and increase it until the report shows both II = 1 and a load latency greater than 1. In this tutorial, a `CACHE_DEPTH` of 5 is needed.
+
+Each experiment only requires running `make report` (refer to the build instructions below), so iterating takes just a few minutes per value. It is important to find the *minimal* value of `CACHE_DEPTH` that yields the maximal performance increase, because unnecessarily large values of `CACHE_DEPTH` consume extra FPGA resources and can reduce fMAX. Therefore, once a `CACHE_DEPTH` yields II = 1 and further increases show no improvement in the load latency, `CACHE_DEPTH` should not be increased any further.
+
+In the tutorial, two versions of the histogram kernel are implemented: one with and one without caching. The report shows II > 1 for the loop in the kernel without caching and II = 1 for the one with caching.
+
+## Key Concepts
+* How to implement the on-chip memory cache optimization technique
+* The scenarios in which this technique benefits performance
+* How to tune the cache depth
+
+## License
+This code sample is licensed under the MIT license.
+
+
+## Building the `onchip_memory_cache` Tutorial
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+
+## Examining the Reports
+Locate `report.html` in the `onchip_memory_cache_report.prj/reports/` or `onchip_memory_cache_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*.
+
+Compare the "Loops Analysis" sections of the reports generated with and without the on-chip memory cache optimization, as described in the "When is the on-chip memory cache technique applicable?" section.
+
+
+## Running the Sample
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./onchip_memory_cache.fpga_emu (Linux)
+ onchip_memory_cache.fpga_emu.exe (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+ ./onchip_memory_cache.fpga (Linux)
+ ```
+
+### Example of Output
+
+```
+Platform name: Intel(R) FPGA SDK for OpenCL(TM)
+Device name: pac_a10 : Intel PAC Platform (pac_ee00000)
+
+
+Number of inputs: 16777216
+Number of outputs: 64
+
+Beginning run without local memory caching.
+
+Verification PASSED
+
+Kernel execution time: 0.114106 seconds
+Kernel throughput without caching: 560.884047 MB/s
+
+Beginning run with local memory caching.
+
+Verification PASSED
+
+Kernel execution time: 0.059061 seconds
+Kernel throughput with caching: 1083.623184 MB/s
+```
+
+### Discussion of Results
+
+A test compile of this tutorial design achieved an fMAX of approximately 250 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results are shown in the following table:
+
+Configuration | Execution Time (s) | Throughput (MB/s)
+-|-|-
+Without caching | 0.153 | 418
+With caching | 0.08 | 809
+
+When caching is used, performance notably increases. As previously mentioned, this technique should result in an II reduction, which should lead to a throughput improvement. The technique can also improve fMAX if the compiler had previously implemented a latency=1 load operation, in which case the fMAX increase should result in a further throughput improvement.
+
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln
new file mode 100755
index 0000000000..3df819f016
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "onchip_memory_cache", "onchip_memory_cache.vcxproj", "{66A01391-21D2-46BB-A37A-6B8670BEE1FC}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Debug|x64.ActiveCfg = Debug|x64
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Debug|x64.Build.0 = Debug|x64
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Release|x64.ActiveCfg = Release|x64
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {E3206292-E99D-4ADC-B428-E0557E8070D4}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj
new file mode 100755
index 0000000000..940683894e
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj
@@ -0,0 +1,160 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+ 15.0
+ {66a01391-21d2-46bb-a37a-6b8670bee1fc}
+ Win32Proj
+ onchip_memory_cache
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)onchip_memory_cache.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)onchip_memory_cache.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json
new file mode 100755
index 0000000000..a35ba679ac
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "93DA332C-5490-4E4B-8038-BDEC1662A2D0",
+ "name": "Caching On-Chip Memory to Improve Loop Performance",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"],
+ "description": "FPGA tutorial demonstrating the caching of on-chip memory to reduce loop initiation interval.",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./onchip_memory_cache.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "onchip_memory_cache.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt
new file mode 100755
index 0000000000..9ed3cee584
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt
@@ -0,0 +1,89 @@
+set(SOURCE_FILE onchip_memory_cache.cpp)
+set(TARGET_NAME onchip_memory_cache)
+
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS "-fintelfpga")
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE})
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS})
+ set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS})
+endif()
+
+# generate report
+if(WIN32)
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
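+
+# Typical usage (a sketch; run from a separate build directory created next to
+# the top-level CMakeLists.txt of this design):
+#   cmake ..
+#   make fpga_emu   # build the FPGA emulator executable
+#   make report     # generate the static optimization report
+#   make fpga       # full FPGA hardware compile (Linux only)
+#   make run        # run the emulator executable built by fpga_emu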
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja
new file mode 100755
index 0000000000..94d90e092c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja
@@ -0,0 +1,30 @@
+source_file = onchip_memory_cache.cpp
+target_name = onchip_memory_cache
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
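+
+# Typical usage (a sketch; assumes dpcpp and ninja are on the PATH and the
+# commands are run from this directory on Windows):
+#   ninja fpga_emu        # build the FPGA emulator executable
+#   ninja report          # optimization report for the Arria(R) 10 PAC
+#   ninja report_s10_pac  # optimization report for the Stratix(R) 10 SX PAC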
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp
new file mode 100755
index 0000000000..83b48eac97
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp
@@ -0,0 +1,235 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <iostream>
+#include "dpc_common.hpp"
+
+using namespace sycl;
+
+constexpr int kInitNumInputs = 16 * 1024 * 1024;  // Default number of inputs
+constexpr int kNumOutputs = 64;                   // Number of histogram bins (outputs)
+constexpr int kInitSeed = 42;                     // Seed for randomizing data inputs
+constexpr int kCacheDepth = 5;                    // Depth of the register cache
+constexpr int kNumRuns = 2;                       // Run twice to show the impact of the cache
+constexpr double kNs = 1000000000.0;              // Number of nanoseconds in a second
+
+// Forward declare the kernel name in the global scope to reduce name mangling
+// in the optimization reports.
+template <bool use_cache>
+class Task;
+
+// This kernel function implements two data paths: with and without caching.
+// use_cache specifies which path to take.
+template <bool use_cache>
+void Histogram(std::unique_ptr<queue>& q, buffer<uint32_t>& input_buf,
+               buffer<uint32_t>& output_buf, event& e) {
+ // Enqueue kernel
+ e = q->submit([&](handler& h) {
+ // Get accessors to the SYCL buffers
+    auto input = input_buf.get_access<access::mode::read>(h);
+    auto output = output_buf.get_access<access::mode::discard_write>(h);
+
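+    // The kernel_args_restrict attribute below promises the compiler that the
+    // kernel's accessor arguments do not alias each other, which allows more
+    // aggressive scheduling of the memory accesses.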
+    h.single_task<Task<use_cache>>([=]() [[intel::kernel_args_restrict]] {
+
+ // On-chip memory for Histogram
+ uint32_t local_output[kNumOutputs];
+ uint32_t local_output_with_cache[kNumOutputs];
+
+ // Register-based cache of recently-accessed memory locations
+ uint32_t last_sum[kCacheDepth + 1];
+ uint32_t last_sum_index[kCacheDepth + 1];
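+      // Because these arrays are only accessed in fully unrolled loops with
+      // compile-time-constant indices, the compiler can implement them as a
+      // shift register in FF resources rather than in on-chip RAM.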
+
+ // Initialize Histogram to zero
+ for (uint32_t b = 0; b < kNumOutputs; ++b) {
+ local_output[b] = 0;
+ local_output_with_cache[b] = 0;
+ }
+
+ // Compute the Histogram
+ if (!use_cache) { // Without cache
+ for (uint32_t n = 0; n < kInitNumInputs; ++n) {
+ // Compute the Histogram index to increment
+ uint32_t b = input[n] % kNumOutputs;
+ local_output[b]++;
+ }
+ } else { // With cache
+
+ // Specify that the minimum dependence-distance of
+ // loop carried variables is kCacheDepth.
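+        // Without this hint the compiler must assume that iteration n may read
+        // the value written by iteration n-1 and would schedule the loop more
+        // conservatively; the register cache below makes the hint safe.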
+ [[intelfpga::ivdep(kCacheDepth)]] for (uint32_t n = 0;
+ n < kInitNumInputs; ++n) {
+ // Compute the Histogram index to increment
+ uint32_t b = input[n] % kNumOutputs;
+
+ // Get the value from the on-chip mem at this index.
+ uint32_t val = local_output_with_cache[b];
+
+ // However, if this location in on-chip mem was recently
+ // written to, take the value from the cache.
+ #pragma unroll
+ for (int i = 0; i < kCacheDepth + 1; i++) {
+ if (last_sum_index[i] == b) val = last_sum[i];
+ }
+
+ // Write the new value to both the cache and the on-chip mem.
+ last_sum[kCacheDepth] = local_output_with_cache[b] = val + 1;
+ last_sum_index[kCacheDepth] = b;
+
+ // Cache is just a shift register, so shift the shift reg. Pushing
+ // into the back of the shift reg is done above.
+ #pragma unroll
+ for (int i = 0; i < kCacheDepth; i++) {
+ last_sum[i] = last_sum[i + 1];
+ last_sum_index[i] = last_sum_index[i + 1];
+ }
+ }
+ }
+
+ // Write output to global memory
+ for (uint32_t b = 0; b < kNumOutputs; ++b) {
+ if (!use_cache) {
+ output[b] = local_output[b];
+ } else {
+ output[b] = local_output_with_cache[b];
+ }
+ }
+ });
+ });
+}
+
+int main() {
+ // Host and kernel profiling
+ event e;
+ ulong t1_kernel, t2_kernel;
+ double time_kernel;
+
+// Create queue, get platform and device
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+ std::cout << "\nEmulator output does not demonstrate true hardware "
+ "performance. The design may need to run on actual hardware "
+ "to observe the performance benefit of the optimization "
+ "exemplified in this tutorial.\n\n";
+#else
+ intel::fpga_selector device_selector;
+#endif
+ try {
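+    // The enable_profiling queue property attaches start/end timestamps to
+    // events submitted to this queue so kernel execution time can be measured.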
+ auto prop_list =
+ property_list{property::queue::enable_profiling()};
+
+    std::unique_ptr<queue> q;
+ q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list));
+
+ platform platform = q->get_context().get_platform();
+ device device = q->get_device();
+ std::cout << "Platform name: "
+              << platform.get_info<info::platform::name>().c_str() << "\n";
+ std::cout << "Device name: "
+              << device.get_info<info::device::name>().c_str() << "\n\n\n";
+
+ std::cout << "\nNumber of inputs: " << kInitNumInputs << "\n";
+ std::cout << "Number of outputs: " << kNumOutputs << "\n\n";
+
+ // Create input and output buffers
+    auto input_buf = buffer<uint32_t>(range<1>(kInitNumInputs));
+    auto output_buf = buffer<uint32_t>(range<1>(kNumOutputs));
+
+ srand(kInitSeed);
+
+ // Compute the reference solution
+ uint32_t gold[kNumOutputs];
+
+ {
+ // Get host-side accessors to the SYCL buffers
+      auto input_host = input_buf.get_access<access::mode::write>();
+ // Initialize random input
+ for (int i = 0; i < kInitNumInputs; ++i) {
+ input_host[i] = rand();
+ }
+
+ for (int b = 0; b < kNumOutputs; ++b) {
+ gold[b] = 0;
+ }
+ for (int i = 0; i < kInitNumInputs; ++i) {
+ int b = input_host[i] % kNumOutputs;
+ gold[b]++;
+ }
+ }
+
+ // Host accessor is now out-of-scope and is destructed. This is required
+ // in order to unblock the kernel's subsequent accessor to the same buffer.
+
+ for (int i = 0; i < kNumRuns; i++) {
+ switch (i) {
+ case 0: {
+ std::cout << "Beginning run without on-chip memory caching.\n\n";
+ Histogram