diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt
new file mode 100755
index 0000000000..6ae6386d49
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(CRR)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md
new file mode 100755
index 0000000000..ab98bae8d7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/README.md
@@ -0,0 +1,224 @@
+# CRR Binomial Tree Model for Option Pricing
+An FPGA-optimized reference design computing the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options.
+
+The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming. Additional reference material specific to option pricing algorithms is provided in the References section of this README.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; <br> Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta) <br> Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | Review a high performance DPC++ design optimized for FPGA
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+
+**Performance**
+Please refer to the performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 118 assets/s
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 243 assets/s
+
+
+## Purpose
+This sample implements the Cox-Ross-Rubinstein (CRR) binomial tree model, which is used in finance to price American exercise options, together with five Greeks (delta, gamma, theta, vega, and rho). The basic idea is to model all possible asset price paths using a binomial tree.
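+
+For reference, below is a minimal CPU-only sketch of CRR pricing for an American option. The function and parameter names are illustrative, not taken from this design; the FPGA kernel in `src/main.cpp` restructures this backward recursion for parallel hardware execution.
+
+```
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+// Illustrative CPU sketch of CRR American option pricing.
+// cp = -1 for a put, +1 for a call; u is the per-step up factor,
+// p the risk-neutral probability of an up move, and disc the
+// one-step discount factor.
+double CrrPrice(int cp, double spot, double strike, double disc,
+                double u, double p, int n_steps) {
+  std::vector<double> value(n_steps + 1);
+  // Option payoff at each terminal node of the tree.
+  for (int i = 0; i <= n_steps; ++i) {
+    double s = spot * std::pow(u, 2 * i - n_steps);
+    value[i] = std::max(cp * (s - strike), 0.0);
+  }
+  // Backward recursion: at every node keep the larger of the discounted
+  // continuation value and the early-exercise payoff (American feature).
+  for (int t = n_steps - 1; t >= 0; --t) {
+    for (int i = 0; i <= t; ++i) {
+      double s = spot * std::pow(u, 2 * i - t);
+      double cont = disc * (p * value[i + 1] + (1.0 - p) * value[i]);
+      value[i] = std::max(cont, cp * (s - strike));
+    }
+  }
+  return value[0];
+}
+```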
+
+## Key Implementation Details
+
+### Design Inputs
+This design reads inputs from the `ordered_inputs.csv` file. The inputs are:
+
+| Input | Description
+--- |---
+| `n_steps` | Number of time steps in the binomial tree. The maximum `n_steps` in this design is 8189.
+| `cp` | -1 or 1 represents put and call options, respectively.
+| `spot` | Spot price of the underlying asset.
+| `fwd` | Forward price of the underlying asset.
+| `strike` | Exercise price of the option.
+| `vol` | Percent volatility that the design reads as a decimal value.
+| `df` | Discount factor to option expiry.
+| `t` | Time, in years, to the maturity of the option.
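+
+As a concrete example, the first row of `src/data/ordered_inputs.csv` is `8189,-1,37.5,37.50112053,85,0.4,0.99997012,0.011952191`. Read in the column order above, it corresponds to the following (illustrative) initialization of the `InputData` structure defined in `src/CRR_common.hpp`:
+
+```
+// Field order follows the InputData struct, not the CSV column order.
+InputData example = {
+    /* cp      */ -1,            // put option
+    /* n_steps */ 8189,          // number of time steps in the binomial tree
+    /* strike  */ 85,
+    /* spot    */ 37.5,
+    /* fwd     */ 37.50112053,
+    /* vol     */ 0.4,           // 40% volatility, given as a decimal
+    /* df      */ 0.99997012,    // discount factor to option expiry
+    /* t       */ 0.011952191};  // time to maturity, in years
+```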
+
+### Design Outputs
+This design writes outputs to the `ordered_outputs.csv` file. The outputs are:
+
+| Output | Description
+--- |---
+| `value` | Option price
+| `delta` | Measures the rate of change of the theoretical option value with respect to changes in the underlying asset's price.
+| `gamma` | Measures the rate of change in the `delta` with respect to changes in the underlying price.
+| `vega` | Measures sensitivity to volatility.
+| `theta` | Measures the sensitivity of the value of the derivative to the passage of time.
+| `rho` | Measures sensitivity to the interest rate.
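+
+The Greeks are not computed from closed-form expressions; they are estimated with finite differences on the binomial tree. A rough sketch of the approach (notation here is illustrative; the exact expressions used are in `ComputeOutput()` in `src/main.cpp`):
+
+```
+\Delta \approx \frac{V_{u} - V_{d}}{S\,(u^{2} - u^{-2})}, \qquad
+\rho \approx \frac{V(r + \epsilon) - V(r)}{\epsilon}, \qquad
+\text{vega} \approx \frac{V(\sigma + \epsilon) - V(\sigma)}{\epsilon}
+```
+
+where `V_u` and `V_d` are option values at underlying prices `S*u^2` and `S*u^-2` (obtained from a tree extended by two extra time steps), `S` is the spot price, `u` the per-step up factor, and `epsilon` a small bump (`kEpsilon` in `src/CRR_common.hpp`) applied to the discount rate or the volatility. `gamma` and `theta` follow from a second difference in the underlying and a centered difference in time, which is why the design evaluates three related trees per asset.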
+
+### Design Correctness
+This design tests the correctness of the optimized FPGA code by comparing its output to a golden result computed on the CPU.
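+
+A minimal sketch of the kind of check performed (the actual comparison, including the `0.00001` tolerance, is in `TestCorrectness()` in `src/main.cpp`):
+
+```
+#include <cmath>
+
+// Accept an FPGA result if it matches the CPU reference to within an
+// absolute tolerance; the design's threshold guarantees that at least
+// 4 decimal places agree.
+bool Matches(double cpu_val, double fpga_val, double threshold = 0.00001) {
+  return std::abs(cpu_val - fpga_val) <= threshold;
+}
+```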
+
+### Design Performance
+This design measures the FPGA performance to determine how many assets can be processed per second.
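+
+Concretely, the reported figure is (see `TestThroughput()` in `src/main.cpp`):
+
+```
+\text{throughput (assets/s)} = \frac{n_{\text{assets}}}{t_{\text{kernel}}}
+```
+
+where `t_kernel` is the wall-clock time of the timed `CrrSolver()` run.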
+
+## License
+This code sample is licensed under the MIT license.
+
+## Building the CRR Program
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (FPGA) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 48h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./crr.fpga_emu <input_file> [-o=<output_file>] (Linux)
+
+ crr.fpga_emu.exe <input_file> [-o=<output_file>] (Windows)
+ ```
+ 2. Run the sample on the FPGA device:
+ ```
+ ./crr.fpga <input_file> [-o=<output_file>] (Linux)
+ ```
+
+### Application Parameters
+
+| Argument | Description
+--- |---
+| `<input_file>` | Optional argument that provides the input data. The default file is `src/data/ordered_inputs.csv`.
+| `-o=<output_file>` | Optional argument that specifies the name of the output file. The default name of the output file is `ordered_outputs.csv`.
+
+### Example of Output
+```
+============ Correctness Test =============
+Running analytical correctness checks...
+CPU-FPGA Equivalence: PASS
+
+============ Throughput Test =============
+Avg throughput: 66.2 assets/s
+```
+
+## Additional Design Information
+
+### Source Code Explanation
+
+| File | Description
+--- |---
+| `main.cpp` | Contains both host code and SYCL* kernel code.
+| `CRR_common.hpp` | Header file for `main.cpp`. Contains the data structures needed for both host code and SYCL* kernel code.
+
+
+
+### Backend Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsdaz` | Denormals are zero
+`-Xsrounding=faithful` | Rounds results to either the upper or lower nearest single-precision numbers
+`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus
+`-Xsseed=2` | Uses seed 2 for the Quartus compile, which yields a slightly higher fMAX
+
+### Preprocessor Define Flags
+
+| Flag | Description
+--- |---
+`-DOUTER_UNROLL=1` | Sets the constant `OUTER_UNROLL` to 1, which controls the number of CRRs that can be processed in parallel
+`-DINNER_UNROLL=64` | Sets the constant `INNER_UNROLL` to 64, which controls the degree of parallelization within the calculation of one CRR
+`-DOUTER_UNROLL_POW2=1` | Sets the constant `OUTER_UNROLL_POW2` to 1, which controls the number of memory banks
+
+
+NOTE: The `-Xsseed`, `-DOUTER_UNROLL`, `-DINNER_UNROLL` and `-DOUTER_UNROLL_POW2` values differ depending on the board being targeted. More information about the unroll factors can be found in `/src/CRR_common.hpp`.
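+
+As a rough, plain-C++ illustration of what these constants control (this is a simplified sketch, not code from the design; the real kernel in `src/main.cpp` fully unrolls the two inner loops and also uses these constants in its FPGA memory-banking attributes):
+
+```
+#include <algorithm>
+
+#ifndef OUTER_UNROLL
+#define OUTER_UNROLL 1   // number of CRR problems processed side by side
+#endif
+#ifndef INNER_UNROLL
+#define INNER_UNROLL 64  // adjacent tree nodes updated together per problem
+#endif
+
+// One level of the backward recursion: the outer dimension walks independent
+// CRR problems, the inner dimension walks adjacent nodes of one tree level.
+// On the FPGA, both inner loops are unrolled, so roughly
+// OUTER_UNROLL * INNER_UNROLL node updates happen per clock cycle.
+// (The per-node early-exercise payoff is simplified to a constant here, and
+// optval is assumed to have at least nodes + 1 rows.)
+void UpdateTreeLevel(double optval[][OUTER_UNROLL], int nodes, double c1,
+                     double c2, double exercise) {
+  for (int n = 0; n + INNER_UNROLL <= nodes; n += INNER_UNROLL) {
+    for (int ic = 0; ic < OUTER_UNROLL; ++ic) {
+      for (int ri = 0; ri < INNER_UNROLL; ++ri) {
+        optval[n + ri][ic] = std::max(
+            c1 * optval[n + ri][ic] + c2 * optval[n + ri + 1][ic], exercise);
+      }
+    }
+  }
+}
+```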
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](https://www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 20, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](https://www.intel.com).
+
+The performance was measured by Intel on July 20, 2020.
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+### References
+
+[Khronos SYCL Resources](https://www.khronos.org/sycl/resources)
+
+[Binomial options pricing model](https://en.wikipedia.org/wiki/Binomial_options_pricing_model)
+
+[Wikipedia page for Greeks (finance)](https://en.wikipedia.org/wiki/Greeks_(finance))
+
+[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln
new file mode 100755
index 0000000000..a95fce9c30
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crr", "crr.vcxproj", "{8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.ActiveCfg = Debug|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Debug|x64.Build.0 = Debug|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.ActiveCfg = Release|x64
+ {8EB512FF-4487-4FEC-9B88-8C0DA734B1B2}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {6887ACDD-3E54-4396-A921-99C630333932}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj
new file mode 100755
index 0000000000..62a523e96c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj
@@ -0,0 +1,165 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {8eb512ff-4487-4fec-9b88-8c0da734b1b2}
+ Win32Proj
+ crr
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions)
+ false
+ $(IntDir)crr.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1 %(AdditionalOptions)
+ $(IntDir)crr.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user
new file mode 100755
index 0000000000..9115b3f275
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/crr.vcxproj.user
@@ -0,0 +1,14 @@
+
+
+
+ false
+
+
+ ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv
+ WindowsLocalDebugger
+
+
+ ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json
new file mode 100755
index 0000000000..6155ce223d
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "D725E06E-0ECE-44F8-910D-AD1A8C89ED89",
+ "name": "CRR Binomial Tree Model for Option Pricing",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "FPGA-optimized reference design of the Cox-Ross-Rubinstein (CRR) binomial tree model with Greeks for American exercise options",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./crr.fpga_emu ./src/data/ordered_inputs.csv -o=./src/data/ordered_outputs.csv"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "crr.fpga_emu.exe ./data/ordered_inputs.csv -o=./data/ordered_outputs.csv"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt
new file mode 100755
index 0000000000..8c56a699ad
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CMakeLists.txt
@@ -0,0 +1,116 @@
+set(SOURCE_FILE main.cpp)
+set(TARGET_NAME crr)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+set(OUTER_UNROLL_A10 1)
+set(INNER_UNROLL_A10 64)
+set(OUTER_UNROLL_POW2_A10 1)
+set(OUTER_UNROLL_S10 2)
+set(INNER_UNROLL_S10 64)
+set(OUTER_UNROLL_POW2_S10 2)
+set(SEED_A10 1)
+set(SEED_S10 2)
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(OUTER_UNROLL ${OUTER_UNROLL_A10})
+SET(INNER_UNROLL ${INNER_UNROLL_A10})
+SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_A10})
+SET(SEED ${SEED_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(OUTER_UNROLL ${OUTER_UNROLL_S10})
+ SET(INNER_UNROLL ${INNER_UNROLL_S10})
+ SET(OUTER_UNROLL_POW2 ${OUTER_UNROLL_POW2_S10})
+ SET(SEED ${SEED_S10})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS -fintelfpga -c -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsdaz -Xsrounding=faithful -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+set(FINAL_LINK_FLAGS -fintelfpga -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2})
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR -DOUTER_UNROLL=${OUTER_UNROLL} -DINNER_UNROLL=${INNER_UNROLL} -DOUTER_UNROLL_POW2=${OUTER_UNROLL_POW2}")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+
+#copy input data
+configure_file("data/ordered_inputs.csv" "data/ordered_inputs.csv" COPYONLY)
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpgas
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "crr_fpga.o")
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${SOURCE_FILE})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} ${DEVICE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_FPGA_OBJ})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CRR_common.hpp CRR_common.hpp COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${SOURCE_FILE} CRR_common.hpp)
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu data/ordered_inputs.csv -o=data/ordered_output.csv
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp
new file mode 100755
index 0000000000..6f2537e1e0
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/CRR_common.hpp
@@ -0,0 +1,149 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __CRR_COMMON_H__
+#define __CRR_COMMON_H__
+
+constexpr int kMaxStringLen = 1024;
+
+// Increments of kMaxNSteps
+constexpr size_t kMaxNSteps = 8189;
+constexpr size_t kMaxNSteps1 = 8190;
+constexpr size_t kMaxNSteps2 = 8191;
+constexpr size_t kMaxNSteps3 = 8192;
+
+// Increment by a small epsilon in order to compute derivative
+// of option price with respect to Vol or Interest. The derivatives
+// are then used to compute Vega and Rho.
+constexpr double kEpsilon = 0.0001;
+
+// Whenever calculations are made for Option Price 0, n_steps must be incremented
+// by 2 to ensure all the required derivative prices are calculated.
+constexpr size_t kOpt0 = 2;
+
+
+// Solver configuration settings that are dependent on selected
+// board. Most notable settings are:
+
+// OUTER_UNROLL controls the number of CRRs that can be processed
+// in parallel in a SIMD fashion (number of CRRS must be >= OUTER_UNROLL).
+// This is ideally a power of two, but does not have to be. Since
+// the DRAM bandwidth requirement is low, increasing OUTER_UNROLL
+// should result in fairly linear speedup. (max: 32 on PAC A10)
+
+// INNER_UNROLL controls the degree of parallelization within
+// the calculation of a single CRR. This must be a power of two. Increasing
+// INNER_UNROLL has a lower area overhead than increasing OUTER_UNROLL;
+// however, there are diminishing returns as INNER_UNROLL is increased with
+// respect to the number of time steps. (max: 128 on PAC A10)
+
+
+// Data structure for original input data.
+typedef struct {
+ int cp; /* cp = -1 or 1 for Put & Call respectively. */
+ double n_steps; /* n_steps = number of time steps in the binomial tree. */
+ double strike; /* strike = exercise price of option. */
+ double spot; /* spot = spot price of the underlying. */
+ double fwd; /* fwd = forward price of the underlying. */
+ double vol; /* vol = per cent volatility, input as a decimal. */
+ double df; /* df = discount factor to option expiry. */
+ double t; /* t = time in years to the maturity of the option. */
+
+} InputData;
+
+// Data structure as the inputs to FPGA.
+// Element[i] is used to compute option_price[i].
+typedef struct {
+ double n_steps; /* n_steps = number of time steps in the binomial tree. */
+ double u[3]; /* u = the increase factor of an up movement in the binomial tree,
+ same for each time step. */
+ double u2[3]; /* u2 = the square of the increase factor. */
+ double c1[3]; /* c1 = the probability of a down movement in the binomial tree,
+ same for each time step. */
+ double c2[3]; /* c2 = the probability of an up movement in the binomial tree. */
+ double umin[3]; /* umin = minimum price of the underlying at the maturity. */
+ double param_1[3];/* param_1[i] = cp * umin[i] */
+ double param_2; /* param_2 = cp * strike */
+
+} CRRInParams;
+
+// Data structure as the output from ProcessKernelResult().
+typedef struct {
+ double pgreek[4]; /* Stores the 4 derivative prices in the binomial tree
+ required to compute the Premium and Greeks. */
+ double vals[3]; /* Three option prices calculated */
+
+} InterRes;
+
+// Data structure for option price and five Greeks.
+typedef struct {
+ double value; /* value = option price. */
+ double delta;
+ double gamma;
+ double vega;
+ double theta;
+ double rho;
+} OutputRes;
+
+// Data structures required by the kernel
+typedef struct {
+ double u;
+ double c1;
+ double c2;
+ double param_1;
+ double param_2;
+ short n_steps;
+ short pad1;
+ int pad2;
+ double pad3;
+ double pad4;
+} CRRMeta;
+
+typedef struct {
+ double u2;
+ double p1powu;
+ double init_optval;
+ double pad;
+} ArrayEle;
+
+typedef struct {
+ ArrayEle array_eles[kMaxNSteps3][3]; /* Second dimension size set to 3 to have a
+ separate ArrayEle for each option price */
+} CRRArrayEles;
+
+typedef struct {
+ ArrayEle array_eles[kMaxNSteps3];
+} CRRPerStepMeta;
+
+typedef struct {
+ double pgreek[4];
+ double optval0;
+ double pad[3];
+} CRRResParams;
+
+#endif
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja
new file mode 100755
index 0000000000..58af917f67
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/build.ninja
@@ -0,0 +1,35 @@
+source_file = main.cpp
+target_name = crr
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsfpc -Xsparallel=2 -Xsseed=5
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+a10_flags = -DOUTER_UNROLL=1 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=1
+s10_flags = -DOUTER_UNROLL=2 -DINNER_UNROLL=64 -DOUTER_UNROLL_POW2=2
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${a10_flags} $in -o $out
+
+rule build_fpga_emu_s10
+ command = dpcpp /GX ${emulator_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${a10_flags} -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${s10_flags} -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv
new file mode 100755
index 0000000000..3a28083fa2
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/data/ordered_inputs.csv
@@ -0,0 +1,10 @@
+8189,-1,37.5,37.50112053,85,0.4,0.99997012,0.011952191
+8189,1,37.5,37.50112053,85,0.4,0.99997012,0.011952191
+8189,-1,270,270.0080678,65,0.18,0.999940241,0.011952191
+8189,1,270,270.0080678,65,0.18,0.999940241,0.011952191
+8189,-1,292.5,292.5087402,70,0.35,0.999940241,0.011952191
+8189,1,292.5,292.5087402,70,0.35,0.999940241,0.011952191
+8189,-1,122.5,122.5109816,40,0.2,0.999910363,0.011952191
+8189,1,122.5,122.5109816,40,0.2,0.999910363,0.011952191
+8189,-1,22.5,22.50067232,55,0.3,0.999910363,0.011952191
+8189,1,22.5,22.50067232,55,0.3,0.999910363,0.011952191
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp
new file mode 100755
index 0000000000..7c92610e19
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/crr/src/main.cpp
@@ -0,0 +1,849 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// CRRSolver CPU/FPGA Accelerator Demo Program
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// This design implements a simple Cox-Ross-Rubinstein (CRR) binomial tree model
+// with Greeks for American exercise options.
+//
+//
+// Optimization summary:
+// -- Area-consuming but infrequent calculation is done on CPU.
+// -- Parallelize the calculation of a single CRR.
+// -- Run multiple independent CRRs in parallel.
+// -- Optimized memory configurations to reduce the need for replication
+// and to eliminate the need for double-pumping M20Ks.
+//
+// The following diagram shows the mechanism of optimizations to CRR.
+//
+//
+// +------+ ^
+// +------------>|optval| |
+// | | [2] | |
+// | +------+ |
+// | |
+// | |
+// +--+---+ |
+// +------------>|optval| |
+// | | [1] | |
+// | +--+---+ |
+// | | |
+// | | |
+// | | | Loop4(L4)
+// | | | updates
+// +---+--+ +------------>+------+ | multiple
+// |optval| |optval| | elements
+// | [0] | | [1] | | in optval[]
+// +---+--+ +------------>+------+ | simultaneously
+// | | |
+// | | |
+// | | |
+// | | |
+// | +--+---+ |
+// | |optval| |
+// +------------>| [0] | |
+// +--+---+ |
+// | |
+// | |
+// | +------+ |
+// | |optval| |
+// +------------>| [0] | |
+// +------+ +
+//
+//
+//
+//
+// step 1 step 2
+//
+//
+// <------------------------------------------+
+// Loop3(L3) updates each level of the tree
+//
+//
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "CRR_common.hpp"
+#include "dpc_common.hpp"
+
+using namespace std;
+using namespace sycl;
+
+class CRRSolver;
+double CrrSolver(const int n_items, vector<CRRMeta> &in_params,
+ vector<CRRResParams> &res_params,
+ vector<CRRPerStepMeta> &in_params2, queue &q) {
+ dpc_common::TimeInterval timer;
+
+ constexpr int steps = kMaxNSteps2;
+
+ const int n_crr =
+ (((n_items + (OUTER_UNROLL - 1)) / OUTER_UNROLL) * OUTER_UNROLL) * 3;
+
+ {
+ buffer<CRRMeta, 1> i_params(in_params.data(), in_params.size());
+ buffer<CRRResParams, 1> r_params(res_params.data(), res_params.size());
+ buffer<CRRPerStepMeta, 1> a_params(in_params2.data(), in_params2.size());
+
+ event e;
+ {
+ e = q.submit([&](handler &h) {
+ auto accessor_v =
+ i_params.template get_access<access::mode::read>(h);
+
+ auto accessor_v2 =
+ a_params.template get_access<access::mode::read>(h);
+
+ auto accessor_r =
+ r_params.template get_access<access::mode::write>(h);
+
+ h.single_task<CRRSolver>([=]() [[intel::kernel_args_restrict]] {
+ // Kernel requires n_crr to be a multiple of OUTER_UNROLL.
+ // This is taken care of by the host.
+ const int n_crr_div = n_crr / OUTER_UNROLL;
+
+ // Outerloop counter. Use while-loop for better timing-closure
+ // characteristics because it tells the compiler the loop body will
+ // never be skipped.
+ int oc = 0;
+ do {
+ // Metadata of CRR problems
+ [[intelfpga::register]] double u[OUTER_UNROLL];
+ [[intelfpga::register]] double c1[OUTER_UNROLL];
+ [[intelfpga::register]] double c2[OUTER_UNROLL];
+ [[intelfpga::register]] double param_1[OUTER_UNROLL];
+ [[intelfpga::register]] double param_2[OUTER_UNROLL];
+ [[intelfpga::register]] short n_steps[OUTER_UNROLL];
+
+ // Current values in binomial tree. We only need to keep track of
+ // one level worth of data, not the entire tree.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double optval[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // Initial values in binomial tree, which correspond to the last
+ // level of the binomial tree.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double init_optval[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // u2_array precalculates the power function of u2.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double u2_array[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // p1powu_array precalculates p1 multiplied by the power of u.
+ [[intelfpga::memory, intelfpga::singlepump,
+ intelfpga::bankwidth(sizeof(double)),
+ intelfpga::numbanks(INNER_UNROLL * OUTER_UNROLL_POW2),
+ intelfpga::private_copies(
+ 8)]] double p1powu_array[kMaxNSteps3][OUTER_UNROLL_POW2];
+
+ // n0_optval stores the binomial tree value corresponding to node 0
+ // of a level. This is the same as what's stored in
+ // optval/init_optval, but replicating this data allows us to have
+ // only one read port for optval and init_optval, thereby removing
+ // the need of double-pumping or replication. n0_optval_2 is a copy
+ // of n0_optval that stores the node 0 value for a specific layer of
+ // the tree. pgreek is the array saving values for post-calculating
+ // Greeks.
+ [[intelfpga::register]] double n0_optval[OUTER_UNROLL];
+ [[intelfpga::register]] double n0_optval_2[OUTER_UNROLL];
+ [[intelfpga::register]] double pgreek[4][OUTER_UNROLL];
+
+ // L1 + L2:
+ // Populate init_optval -- calculate the last level of the binomial
+ // tree.
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ // Transfer data from DRAM to local memory or registers
+ const int c = oc * OUTER_UNROLL + ic;
+ const CRRMeta param = accessor_v[c];
+
+ u[ic] = param.u;
+ c1[ic] = param.c1;
+ c2[ic] = param.c2;
+ param_1[ic] = param.param_1;
+ param_2[ic] = param.param_2;
+ n_steps[ic] = param.n_steps;
+
+ for (short t = steps; t >= 0; --t) {
+ const ArrayEle param_array = accessor_v2[c].array_eles[t];
+
+ const double init_val = param_array.init_optval;
+
+ init_optval[t][ic] = init_val;
+
+ // n0_optval intends to store the node value at t == 0.
+ // Instead of qualifying this statement by an "if (t == 0)",
+ // which couples the loop counter to the timing path of the
+ // assignment, we reverse the loop direction so the last value
+ // stored corresponds to t == 0.
+ n0_optval[ic] = init_val;
+
+ // Transfer data from DRAM to local memory or registers
+ u2_array[t][ic] = param_array.u2;
+ p1powu_array[t][ic] = param_array.p1powu;
+ }
+ }
+
+ // L3:
+ // Update optval[] -- calculate each level of the binomial tree.
+ // reg[] helps to achieve updating INNER_UNROLL elements in optval[]
+ // simultaneously.
+ [[intelfpga::disable_loop_pipelining]] for (short t = 0;
+ t <= steps - 1; ++t) {
+ [[intelfpga::register]] double reg[INNER_UNROLL + 1][OUTER_UNROLL];
+
+ double val_1, val_2;
+
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ reg[0][ic] = n0_optval[ic];
+ }
+
+ // L4:
+ // Calculate all the elements in optval[] -- all the tree nodes
+ // for one level of the tree
+ [[intelfpga::ivdep]] for (int n = 0; n <= steps - 1 - t;
+ n += INNER_UNROLL) {
+
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+
+ #pragma unroll
+ for (short ri = 1; ri <= INNER_UNROLL; ++ri) {
+ reg[ri][ic] =
+ (t == 0) ? init_optval[n + ri][ic] : optval[n + ri][ic];
+ }
+
+ #pragma unroll
+ for (short ri = 0; ri < INNER_UNROLL; ++ri) {
+ const double val = sycl::fmax(
+ c1[ic] * reg[ri][ic] + c2[ic] * reg[ri + 1][ic],
+ p1powu_array[t][ic] * u2_array[n + ri][ic] -
+ param_2[ic]);
+
+ optval[n + ri][ic] = val;
+ if (n + ri == 0) {
+ n0_optval[ic] = val;
+ }
+ if (n + ri == 1) {
+ val_1 = val;
+ }
+ if (n + ri == 2) {
+ val_2 = val;
+ }
+ }
+
+ reg[0][ic] = reg[INNER_UNROLL][ic];
+
+ if (t == steps - 5) {
+ pgreek[3][ic] = val_2;
+ }
+ if (t == steps - 3) {
+ pgreek[0][ic] = n0_optval[ic];
+ pgreek[1][ic] = val_1;
+ pgreek[2][ic] = val_2;
+ n0_optval_2[ic] = n0_optval[ic];
+ }
+ }
+ }
+ }
+
+ // L5: transfer crr_res_params to DRAM
+ #pragma unroll
+ for (short ic = 0; ic < OUTER_UNROLL; ++ic) {
+ const int c = oc * OUTER_UNROLL + ic;
+ if (n_steps[ic] < steps) {
+ accessor_r[c].optval0 = n0_optval_2[ic];
+ } else {
+ accessor_r[c].optval0 = n0_optval[ic];
+ }
+ accessor_r[c].pgreek[0] = pgreek[0][ic];
+ accessor_r[c].pgreek[1] = pgreek[1][ic];
+ accessor_r[c].pgreek[2] = pgreek[2][ic];
+ accessor_r[c].pgreek[3] = pgreek[3][ic];
+ }
+ // Increment counters
+ oc += 1;
+ } while (oc < n_crr_div);
+ });
+ });
+ }
+ }
+
+ double diff = timer.Elapsed();
+ return diff;
+}
+
+void ReadInputFromFile(ifstream &input_file, vector<InputData> &inp) {
+ string line_of_args;
+ while (getline(input_file, line_of_args)) {
+ InputData temp;
+ istringstream line_of_args_ss(line_of_args);
+ line_of_args_ss >> temp.n_steps;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.cp;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.spot;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.fwd;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.strike;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.vol;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.df;
+ line_of_args_ss.ignore(1, ',');
+ line_of_args_ss >> temp.t;
+
+ inp.push_back(temp);
+ }
+}
+
+static string ToStringWithPrecision(const double value, const int p = 6) {
+ ostringstream out;
+ out.precision(p);
+ out << std::fixed << value;
+ return out.str();
+}
+
+void WriteOutputToFile(ofstream &output_file, const vector<OutputRes> &outp) {
+ size_t n = outp.size();
+ for (size_t i = 0; i < n; ++i) {
+ OutputRes temp;
+ temp = outp[i];
+ string line = ToStringWithPrecision(temp.value, 12) + " " +
+ ToStringWithPrecision(temp.delta, 12) + " " +
+ ToStringWithPrecision(temp.gamma, 12) + " " +
+ ToStringWithPrecision(temp.vega, 12) + " " +
+ ToStringWithPrecision(temp.theta, 12) + " " +
+ ToStringWithPrecision(temp.rho, 12) + "\n";
+
+ output_file << line;
+ }
+}
+
+bool FindGetArgString(const string &arg, const char *str, char *str_value,
+ size_t maxchars) {
+ size_t found = arg.find(str, 0, strlen(str));
+ if (found != string::npos) {
+ const char *sptr = &arg.c_str()[strlen(str)];
+ for (int i = 0; i < maxchars - 1; i++) {
+ char ch = sptr[i];
+ switch (ch) {
+ case ' ':
+ case '\t':
+ case '\0':
+ str_value[i] = 0;
+ return true;
+ break;
+ default:
+ str_value[i] = ch;
+ break;
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Perform data pre-processing work
+// Three different option prices are required to solve each CRR problem
+// The following lists why each option price is required:
+// [0] : Used to compute Premium, Delta, Gamma and Theta
+// [1] : Used to compute Rho
+// [2] : Used to compute Vega
+CRRInParams PrepareData(const InputData &inp) {
+ CRRInParams in_params;
+ in_params.n_steps = inp.n_steps;
+
+ double r[2];
+ r[0] = pow(inp.df, 1.0 / inp.n_steps);
+ double d_df = exp(-inp.t * kEpsilon);
+ r[1] = pow(inp.df * d_df, 1.0 / inp.n_steps);
+ in_params.u[0] = exp(inp.vol * sqrt(inp.t / inp.n_steps));
+ in_params.u[1] = in_params.u[0];
+ in_params.u[2] = exp((inp.vol + kEpsilon) * sqrt(inp.t / inp.n_steps));
+
+ in_params.u2[0] = in_params.u[0] * in_params.u[0];
+ in_params.u2[1] = in_params.u[1] * in_params.u[1];
+ in_params.u2[2] = in_params.u[2] * in_params.u[2];
+ in_params.umin[0] = inp.spot * pow(1 / in_params.u[0], inp.n_steps + kOpt0);
+ in_params.umin[1] = inp.spot * pow(1 / in_params.u[1], inp.n_steps);
+ in_params.umin[2] = inp.spot * pow(1 / in_params.u[2], inp.n_steps);
+ in_params.c1[0] =
+ r[0] * (in_params.u[0] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[0] - 1 / in_params.u[0]);
+ in_params.c1[1] =
+ r[1] *(in_params.u[1] - pow((inp.fwd / d_df) / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[1] - 1 / in_params.u[1]);
+ in_params.c1[2] =
+ r[0] * (in_params.u[2] - pow(inp.fwd / inp.spot, 1.0 / inp.n_steps)) /
+ (in_params.u[2] - 1 / in_params.u[2]);
+ in_params.c2[0] = r[0] - in_params.c1[0];
+ in_params.c2[1] = r[1] - in_params.c1[1];
+ in_params.c2[2] = r[0] - in_params.c1[2];
+
+ in_params.param_1[0] = inp.cp * in_params.umin[0];
+ in_params.param_1[1] = inp.cp * in_params.umin[1];
+ in_params.param_1[2] = inp.cp * in_params.umin[2];
+ in_params.param_2 = inp.cp * inp.strike;
+
+ return in_params;
+}
+
+CRRArrayEles PrepareArrData(const CRRInParams &in) {
+ CRRArrayEles arr;
+
+ // Write in reverse t-direction to match kernel access pattern
+ for (int i = 0; i <= in.n_steps + kOpt0; ++i) {
+ for (int inner_func_index = 0; inner_func_index < 3; ++inner_func_index) {
+ arr.array_eles[i][inner_func_index].u2 = pow(in.u2[inner_func_index], i);
+ arr.array_eles[i][inner_func_index].p1powu =
+ in.param_1[inner_func_index] * pow(in.u[inner_func_index], i + 1);
+ arr.array_eles[i][inner_func_index].init_optval =
+ fmax(in.param_1[inner_func_index] * pow(in.u2[inner_func_index], i) -
+ in.param_2, 0.0);
+ }
+ }
+
+ return arr;
+}
+
+// Metadata, used in the Kernel, is generated from the input data
+// Each CRR problem is split into 3 subproblems to calculate
+// each required option price separately
+void PrepareKernelData(vector<CRRInParams> &in_params,
+ vector<CRRArrayEles> &array_params,
+ vector<CRRMeta> &in_buff_params,
+ vector<CRRPerStepMeta> &in_buff2_params,
+ const int n_crrs) {
+
+ constexpr short offset = 0;
+
+ for (int wi_idx = offset, dst = offset * 3; wi_idx < n_crrs; ++wi_idx) {
+ CRRInParams &src_crr_params = in_params[wi_idx];
+
+ CRRArrayEles &src_crr_eles = array_params[wi_idx];
+
+ for (int inner_func_index = 0; inner_func_index < 3;
+ ++inner_func_index, ++dst) {
+ CRRMeta &dst_crr_meta = in_buff_params[dst];
+ CRRPerStepMeta &dst_crr_per_step_meta = in_buff2_params[dst];
+
+ dst_crr_meta.u = src_crr_params.u[inner_func_index];
+ dst_crr_meta.c1 = src_crr_params.c1[inner_func_index];
+ dst_crr_meta.c2 = src_crr_params.c2[inner_func_index];
+
+ dst_crr_meta.param_1 = src_crr_params.param_1[inner_func_index];
+ dst_crr_meta.param_2 = src_crr_params.param_2;
+
+ if (inner_func_index == 0) {
+ dst_crr_meta.n_steps = src_crr_params.n_steps + kOpt0;
+ } else {
+ dst_crr_meta.n_steps = src_crr_params.n_steps;
+ }
+ for (int i = 0; i <= kMaxNSteps2; ++i) {
+ dst_crr_per_step_meta.array_eles[i].u2 =
+ src_crr_eles.array_eles[i][inner_func_index].u2;
+ dst_crr_per_step_meta.array_eles[i].p1powu =
+ src_crr_eles.array_eles[i][inner_func_index].p1powu;
+ dst_crr_per_step_meta.array_eles[i].init_optval =
+ src_crr_eles.array_eles[i][inner_func_index].init_optval;
+ }
+ }
+ }
+}
+
+// Takes in the result from the kernel and stores the 3 option prices
+// belonging to the same CRR problem in one InterRes element
+void ProcessKernelResult(const vector<CRRResParams> &res_params,
+ vector<InterRes> &postp_buff, const int n_crrs) {
+ constexpr int offset = 0;
+
+ for (int wi_idx = offset, src = offset * 3; wi_idx < n_crrs; ++wi_idx) {
+ InterRes &dst_res = postp_buff[wi_idx];
+
+ for (int inner_func_index = 0; inner_func_index < 3;
+ ++inner_func_index, ++src) {
+ const CRRResParams &src_res = res_params[src];
+
+ for (int i = 0; i < 4; ++i) {
+ if (inner_func_index == 0) {
+ dst_res.pgreek[i] = src_res.pgreek[i];
+ }
+ }
+
+ dst_res.vals[inner_func_index] = src_res.optval0;
+ }
+ }
+}
+
+// Computes the Premium and Greeks
+OutputRes ComputeOutput(const InputData &inp, const CRRInParams &in_params,
+ const InterRes &res_params) {
+ double h;
+ OutputRes res;
+ h = inp.spot * (in_params.u2[0] - 1 / in_params.u2[0]);
+ res.value = res_params.pgreek[1];
+ res.delta = (res_params.pgreek[2] - res_params.pgreek[0]) / h;
+ res.gamma = 2 / h *
+ ((res_params.pgreek[2] - res_params.pgreek[1]) / inp.spot /
+ (in_params.u2[0] - 1) -
+ (res_params.pgreek[1] - res_params.pgreek[0]) / inp.spot /
+ (1 - (1 / in_params.u2[0])));
+ res.theta =
+ (res_params.vals[0] - res_params.pgreek[3]) / 4 / inp.t * inp.n_steps;
+ res.rho = (res_params.vals[1] - res.value) / kEpsilon;
+ res.vega = (res_params.vals[2] - res.value) / kEpsilon;
+ return res;
+}
+
+// Perform CRR solving using the CPU and compare FPGA results with CPU results
+// to test correctness.
+void TestCorrectness(int k, int n_crrs, bool &pass, const InputData &inp,
+ CRRInParams &vals, const OutputRes &fpga_res) {
+ if (k == 0) {
+ std::cout << "\n============= Correctness Test ============= \n";
+ std::cout << "Running analytical correctness checks... \n";
+ }
+
+ // This CRR benchmark ensures a minimum of 4 decimal places of agreement between the FPGA and the CPU
+ // "threshold" is chosen to enforce this guarantee
+ float threshold = 0.00001;
+ int i, j, q;
+ double x;
+ int n_steps = vals.n_steps;
+ int m = n_steps + kOpt0;
+ vector<double> pvalue(kMaxNSteps3);
+ vector<double> pvalue_1(kMaxNSteps1);
+ vector<double> pvalue_2(kMaxNSteps1);
+ vector<double> pgreek(5);
+ InterRes cpu_res_params;
+ OutputRes cpu_res;
+
+ // option value computed at each final node
+ x = vals.umin[0];
+ for (i = 0; i <= m; i++, x *= vals.u2[0]) {
+ pvalue[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+ }
+
+ // backward recursion to evaluate option price
+ for (i = m - 1; i >= 0; i--) {
+ vals.umin[0] *= vals.u[0];
+ x = vals.umin[0];
+ for (j = 0; j <= i; j++, x *= vals.u2[0]) {
+ pvalue[j] = fmax(vals.c1[0] * pvalue[j] + vals.c2[0] * pvalue[j + 1],
+ inp.cp * (x - inp.strike));
+ }
+ if (i == 4) {
+ pgreek[4] = pvalue[2];
+ }
+ if (i == 2) {
+ for (q = 0; q <= 2; q++) {
+ pgreek[q + 1] = pvalue[q];
+ }
+ }
+ }
+ cpu_res_params.vals[0] = pvalue[0];
+
+ // the above computation is repeated for each option price
+ x = vals.umin[1];
+ for (i = 0; i <= n_steps; i++, x *= vals.u2[1]) {
+ pvalue_1[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+ }
+
+ for (i = n_steps - 1; i >= 0; i--) {
+ vals.umin[1] *= vals.u[1];
+ x = vals.umin[1];
+
+ for (j = 0; j <= i; j++, x *= vals.u2[1]) {
+ pvalue_1[j] =
+ fmax(vals.c1[1] * pvalue_1[j] + vals.c2[1] * pvalue_1[j + 1],
+ inp.cp * (x - inp.strike));
+ }
+ }
+ cpu_res_params.vals[1] = pvalue_1[0];
+
+ x = vals.umin[2];
+ for (i = 0; i <= n_steps; i++, x *= vals.u2[2]) {
+ pvalue_2[i] = fmax(inp.cp * (x - inp.strike), 0.0);
+ }
+
+ for (i = n_steps - 1; i >= 0; i--) {
+ vals.umin[2] *= vals.u[2];
+ x = vals.umin[2];
+ for (j = 0; j <= i; j++, x *= vals.u2[2]) {
+ pvalue_2[j] =
+ fmax(vals.c1[2] * pvalue_2[j] + vals.c2[2] * pvalue_2[j + 1],
+ inp.cp * (x - inp.strike));
+ }
+ }
+ cpu_res_params.vals[2] = pvalue_2[0];
+ pgreek[0] = 0;
+
+ for (i = 1; i < 5; ++i) {
+ cpu_res_params.pgreek[i - 1] = pgreek[i];
+ }
+
+ cpu_res = ComputeOutput(inp, vals, cpu_res_params);
+
+ if (abs(cpu_res.value - fpga_res.value) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.value " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.value << "\n";
+ std::cout << "cpu_res.value " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.value << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.delta - fpga_res.delta) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.delta " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.delta << "\n";
+ std::cout << "cpu_res.delta " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.delta << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.gamma - fpga_res.gamma) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.gamma " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.gamma << "\n";
+ std::cout << "cpu_res.gamma " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.gamma << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.vega - fpga_res.vega) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.vega " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.vega << "\n";
+ std::cout << "cpu_res.vega " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.vega << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.theta - fpga_res.theta) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.theta " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.theta << "\n";
+ std::cout << "cpu_res.theta " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.theta << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+ if (abs(cpu_res.rho - fpga_res.rho) > threshold) {
+ pass = false;
+ std::cout << "fpga_res.rho " << k << " = " << std::fixed
+ << std::setprecision(20) << fpga_res.rho << "\n";
+ std::cout << "cpu_res.rho " << k << " = " << std::fixed
+ << std::setprecision(20) << cpu_res.rho << "\n";
+ std::cout << "Mismatch detected for value of crr " << k << "\n";
+ }
+
+ if (k == n_crrs - 1) {
+ std::cout << "CPU-FPGA Equivalence: " << (pass ? "PASS" : "FAIL") << "\n";
+ }
+}
+
+// Print out the achieved CRR throughput
+void TestThroughput(const double &time, const int &n_crrs) {
+ std::cout << "\n============= Throughput Test =============\n";
+
+ std::cout << " Avg throughput: " << std::fixed << std::setprecision(1)
+ << (n_crrs / time) << " assets/s\n";
+}
+
+int main(int argc, char *argv[]) {
+ string infilename = "";
+ string outfilename = "";
+
+ const string default_ifile = "src/data/ordered_inputs.csv";
+ const string default_ofile = "src/data/ordered_outputs.csv";
+
+ char str_buffer[kMaxStringLen] = {0};
+ for (int i = 1; i < argc; i++) {
+ if (argv[i][0] == '-') {
+ string sarg(argv[i]);
+
+ FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen);
+ FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen);
+ } else {
+ infilename = string(argv[i]);
+ }
+ }
+
+ try {
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ queue q(device_selector, dpc_common::exception_handler);
+
+ std::cout << "Running on device: "
+ << q.get_device().get_info<info::device::name>().c_str() << "\n";
+
+ device device = q.get_device();
+ std::cout << "Device name: "
+ << device.get_info<info::device::name>().c_str() << "\n \n \n";
+
+ vector<InputData> inp;
+
+ // Get the input file name; if the user does not provide an input file,
+ // the design uses the default input file
+ if (infilename == "") {
+ infilename = default_ifile;
+ }
+ ifstream inputFile(infilename);
+
+ if (!inputFile.is_open()) {
+ std::cerr << "Input file doesn't exist \n";
+ return 1;
+ }
+
+ // Check input file format
+ string filename = infilename;
+ std::size_t found = filename.find_last_of(".");
+ if (!(filename.substr(found + 1).compare("csv") == 0)) {
+ std::cerr << "Input file format only support .csv\n";
+ return 1;
+ }
+
+ // Get the output file name; if the user does not specify one, the design
+ // uses the default output file
+ outfilename = default_ofile;
+ if (strlen(str_buffer)) {
+ outfilename = string(str_buffer);
+ }
+
+ // Check output file format
+ filename = outfilename;
+ found = filename.find_last_of(".");
+ if (!(filename.substr(found + 1).compare("csv") == 0)) {
+ std::cerr << "Output file format only support .csv\n";
+ return 1;
+ }
+
+ // Read inputs data from input file
+ ReadInputFromFile(inputFile, inp);
+
+// Get the number of data items from the input file
+// Emulator mode only goes through one input (or through OUTER_UNROLL inputs) to
+// ensure fast runtime
+#if defined(FPGA_EMULATOR)
+ int temp_crrs = 1;
+#else
+ int temp_crrs = inp.size();
+#endif
+
+ // Check if n_crrs >= OUTER_UNROLL
+ if (OUTER_UNROLL >= temp_crrs) {
+ if (inp.size() < OUTER_UNROLL) {
+ std::cerr << "Input size must be greater than or equal to OUTER_UNROLL\n";
+ return 1;
+ } else {
+ temp_crrs = OUTER_UNROLL;
+ }
+ }
+
+ const int n_crrs = temp_crrs;
+
+ vector<CRRInParams> in_params(n_crrs);
+ vector<CRRArrayEles> array_params(n_crrs);
+
+ for (int j = 0; j < n_crrs; ++j) {
+ in_params[j] = PrepareData(inp[j]);
+ array_params[j] = PrepareArrData(in_params[j]);
+ }
+
+ // The following vectors are arguments for CrrSolver
+ vector<CRRMeta> in_buff_params(n_crrs * 3);
+ vector<CRRPerStepMeta> in_buff2_params(n_crrs * 3);
+
+ vector<CRRResParams> res_params(n_crrs * 3);
+ vector<CRRResParams> res_params_dummy(n_crrs * 3);
+
+ // Prepare metadata as input to kernel
+ PrepareKernelData(in_params, array_params, in_buff_params, in_buff2_params,
+ n_crrs);
+
+ // Warmup run - use this run to warm up the accelerator
+ CrrSolver(n_crrs, in_buff_params, res_params_dummy, in_buff2_params,
+ q);
+ // Timed run - profile performance
+ double time = CrrSolver(n_crrs, in_buff_params, res_params,
+ in_buff2_params, q);
+ bool pass = true;
+
+ // Postprocessing step
+ // process_res used to compute final results
+ vector<InterRes> process_res(n_crrs);
+ ProcessKernelResult(res_params, process_res, n_crrs);
+
+ vector<OutputRes> result(n_crrs);
+ for (int i = 0; i < n_crrs; ++i) {
+ result[i] = ComputeOutput(inp[i], in_params[i], process_res[i]);
+ TestCorrectness(i, n_crrs, pass, inp[i], in_params[i], result[i]);
+ }
+
+ // Write output data to the output file
+ ofstream outputFile(outfilename);
+
+ WriteOutputToFile(outputFile, result);
+
+ TestThroughput(time, n_crrs);
+
+ } catch (sycl::exception const &e) {
+ std::cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
+    std::cout << "   If you are targeting FPGA hardware, "
+                 "ensure that your system is plugged into an FPGA board that "
+                 "is set up correctly\n";
+ std::cout << " If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR\n";
+ return 1;
+ }
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt
new file mode 100755
index 0000000000..9ac77b0aff
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(GZip)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md
new file mode 100755
index 0000000000..18117a82a5
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/README.md
@@ -0,0 +1,201 @@
+# GZIP Compression
+Reference design demonstrating high-performance GZIP compression on FPGA.
+
+***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming. Additional reference material specific to this GZIP implementation is provided in the References section of this README.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; <br> Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta) <br> Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How to implement a high performance multi-engine compression algorithm on FPGA
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+**Performance**
+Please refer to the performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 1 engine @ 3.4 GB/s
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 2 engines @ 5.5 GB/s each = 11.0 GB/s total
+
+
+## Purpose
+
+This DPC++ reference design implements a compression algorithm optimized for the FPGA device. The compressed output is GZIP-compatible and can be decompressed with GUNZIP. The output follows GZIP's DEFLATE format, using a fixed subset of [RFC 1951](https://www.ietf.org/rfc/rfc1951.txt). See the References section for further reading.
+
+The algorithm uses a GZIP-compatible Lempel-Ziv 77 (LZ77) algorithm for data de-duplication and a GZIP-compatible Static Huffman algorithm for bit reduction. The implementation includes three FPGA-accelerated tasks (LZ77, Static Huffman, and CRC).
+
+The FPGA implementation of the algorithm enables either one or two independent GZIP compute engines to operate in parallel on the FPGA. The number of engines is constrained by the available FPGA resources. By default, the design is parameterized to create a single engine when the design is compiled targeting Intel® PAC with Intel Arria® 10 GX FPGA. Two engines are created when targeting Intel® PAC with Intel Stratix® 10 SX FPGA, a larger device.
+
+## Key Implementation Details
+
+ | Kernel | Description
+--- |---
+| LZ Reduction | Implements an LZ77 algorithm for data de-duplication. The algorithm produces distance and length information that is compatible with GZIP's DEFLATE implementation.
+| Static Huffman | Uses the same Static Huffman codes used by GZIP's DEFLATE algorithm when it chooses a Static Huffman coding scheme for bit reduction. This choice maintains compatibility with GUNZIP.
+| CRC | Adds a CRC checksum based on the input file; this is required by the GZIP file format.
+
+To optimize performance, GZIP leverages techniques discussed in the following FPGA tutorials (a minimal sketch of the double-buffering pattern follows the list):
+* **Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing** (double_buffering)
+* **On-Chip Memory Attributes** (mem_config)
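+
+The sketch below is generic and **not** this design's kernel code; the queue, buffer sizes, and `ProducerKernel` name are illustrative only. It shows the double-buffering idea: while the device works on one buffer, the host is free to prepare or post-process the other.
+
+```
+#include <CL/sycl.hpp>
+#include <array>
+#include <cstddef>
+
+using namespace cl::sycl;
+
+// Launch `iterations` kernels over two alternating buffers.
+void RunDoubleBuffered(queue &q, int iterations, size_t n) {
+  std::array<buffer<int, 1>, 2> bufs{{buffer<int, 1>{range<1>(n)},
+                                      buffer<int, 1>{range<1>(n)}}};
+  std::array<event, 2> done;
+  for (int i = 0; i < iterations; i++) {
+    int set = i % 2;               // alternate between the two buffer sets
+    if (i >= 2) done[set].wait();  // a set is free once its previous run ended
+    done[set] = q.submit([&](handler &h) {
+      auto acc = bufs[set].get_access<access::mode::discard_write>(h);
+      h.single_task<class ProducerKernel>([=] { acc[0] = 1; });
+    });
+    // ... host-side processing of the *other* buffer set would go here ...
+  }
+  q.wait();
+}
+```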
+
+
+## License
+This code sample is licensed under MIT license.
+
+
+## Building the `gzip` Reference Design
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+
+## Running the Reference Design
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+   ./gzip.fpga_emu <input_file> [-o=<output_file>]     (Linux)
+   gzip.fpga_emu.exe <input_file> [-o=<output_file>]   (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+   ./gzip.fpga <input_file> [-o=<output_file>]         (Linux)
+ ```
+ ### Application Parameters
+
+| Argument | Description
+--- |---
+| `<input_file>` | Mandatory argument that specifies the file to be compressed. Use a 120+ MB file to achieve peak performance.
+| `-o=<output_file>` | Optional argument that specifies the name of the output file. The default name of the output file is `<input_file>.gz`. When targeting Intel Stratix® 10 SX, the single `<input_file>` is fed to both engines, yielding two identical output files, using `<output_file>` as the basis for the filenames.
+
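+For example, to compress a file named `my_data.bin` (file name illustrative) on the FPGA device and write the result to `my_data.gz`:
+
+```
+./gzip.fpga my_data.bin -o=my_data.gz
+```
+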
+### Example of Output
+
+```
+Running on device: pac_a10 : Intel PAC Platform (pac_ee00000)
+Throughput: 3.4321 GB/s
+Compression Ratio 33.2737%
+PASSED
+```
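+
+On Linux, the output can be checked against the original input with standard tools; this mirrors what `CompareGzip.cpp` does in the design's verification step (file names illustrative):
+
+```
+gunzip -c my_data.gz > /tmp/my_data.decompressed
+diff -q /tmp/my_data.decompressed my_data.bin
+```
+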
+## Additional Design Information
+### Source Code Explanation
+
+| File | Description
+--- |---
+| `gzip.cpp` | Contains the `main()` function and the top-level interfaces to the SYCL* GZIP functions.
+| `gzipkernel.cpp` | Contains the SYCL* kernels used to implement GZIP.
+| `CompareGzip.cpp` | Contains code to compare a GZIP-compatible file with the original input.
+| `WriteGzip.cpp` | Contains code to write a GZIP compatible file.
+| `crc32.cpp` | Contains code to calculate a 32-bit CRC that is compatible with the GZIP file format and to combine multiple 32-bit CRC values. It is used to account only for the CRC of the last few bytes in the file, which are not processed by the accelerated CRC kernel.
+| `kernels.hpp` | Contains miscellaneous defines and structure definitions required by the LZReduction and Static Huffman kernels.
+| `crc32.hpp` | Header file for `crc32.cpp`.
+| `gzipkernel.hpp` | Header file for `gzipkernel.cpp`.
+| `CompareGzip.hpp` | Header file for `CompareGzip.cpp`.
+| `WriteGzip.hpp` | Header file for `WriteGzip.cpp`.
+
+### Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsparallel=2` | Uses 2 cores when compiling the bitstream through Quartus
+`-Xsseed=1` | Uses seed 1 during Quartus, yields slightly higher fmax
+`-Xsnum-reorder=6` | Specifies a wider data path for reads from global memory; applied only when targeting Intel Stratix® 10 SX
+`-DNUM_ENGINES=<1|2>` | Specifies that 1 GZIP engine should be compiled when targeting Arria® 10 GX and 2 engines when targeting Intel Stratix® 10 SX
+
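+For reference, the hardware compile driven by the generated `Makefile` is roughly equivalent to the following single command (shown for the Arria® 10 target; the actual build splits compilation and linking into several steps):
+
+```
+dpcpp -fintelfpga -Xshardware -Xsparallel=2 -Xsseed=1 -Xsboard=intel_a10gx_pac:pac_a10 -DNUM_ENGINES=1 src/gzipkernel.cpp src/gzip.cpp src/crc32.cpp src/WriteGzip.cpp src/CompareGzip.cpp -o gzip.fpga
+```
+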
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 29, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com).
+
+Performance was measured by Intel on July 29, 2020.
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+### References
+[Khronos SYCL Resources](https://www.khronos.org/sycl/resources)
+
+[Intel GZIP OpenCL Design Example](https://www.intel.com/content/www/us/en/programmable/support/support-resources/design-examples/design-software/opencl/gzip-compression.html)
+
+[RFC 1951 - DEFLATE Data Format](https://www.ietf.org/rfc/rfc1951.txt)
+
+[RFC 1952 - GZIP File Format Specification version 4.3](https://www.ietf.org/rfc/rfc1952.txt)
+
+[OpenCL Intercept Layer](https://github.com/intel/opencl-intercept-layer)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt
new file mode 100755
index 0000000000..a75dd96a90
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/Zlib_License.txt
@@ -0,0 +1,25 @@
+zlib License
+
+ zlib.h -- interface of the 'zlib' general purpose compression library
+ version 1.2.11, January 15th, 2017
+
+ Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ Jean-loup Gailly Mark Adler
+ jloup@gzip.org madler@alumni.caltech.edu
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln
new file mode 100755
index 0000000000..580f35f08b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gzip", "gzip.vcxproj", "{CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.ActiveCfg = Debug|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Debug|x64.Build.0 = Debug|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.ActiveCfg = Release|x64
+ {CF6A576B-665D-4F24-BB62-0DAE7A7B3C64}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {92BEFAAB-0365-4E5A-9C4A-E50AB49B2A6B}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj
new file mode 100755
index 0000000000..cf6a2462d2
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj
@@ -0,0 +1,174 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {cf6a576b-665d-4f24-bb62-0dae7a7b3c64}
+ Win32Proj
+ gzip
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user
new file mode 100755
index 0000000000..1956841792
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/gzip.vcxproj.user
@@ -0,0 +1,14 @@
+
+
+
+ false
+
+
+ src/gzip.cpp -o=test.gz
+ WindowsLocalDebugger
+
+
+ src/gzip.cpp -o=test.gz
+ WindowsLocalDebugger
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json
new file mode 100755
index 0000000000..a6d65ecd17
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "D55081EB-669D-4832-BCE6-23EE2ACA9F0F",
+ "name": "GZIP Compression",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "Reference design demonstrating high-performance GZIP compression on FPGA",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./gzip.fpga_emu ../src/gzip.cpp -o=test.gz"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "gzip.fpga_emu.exe ../src/gzip.cpp -o=test.gz"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt
new file mode 100755
index 0000000000..bf6125045f
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CMakeLists.txt
@@ -0,0 +1,125 @@
+set(DEVICE_SOURCE_FILE gzipkernel.cpp)
+set(DEVICE_HEADER_FILE gzipkernel.hpp)
+set(HOST_SOURCE_FILE gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp)
+
+set(TARGET_NAME gzip)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+
+# To increase NUM_ENGINES beyond 2, you must also statically declare more engines in gzipkernel.cpp --> SubmitGzipTasks()
+set(NUM_ENGINES_A10 1)
+set(NUM_ENGINES_S10 2)
+set(NUM_REORDER "")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(NUM_ENGINES ${NUM_ENGINES_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(NUM_ENGINES ${NUM_ENGINES_S10})
+ set(NUM_REORDER "-Xsnum-reorder=6")
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+# Specify -MMD -fsycl-link-targets=... instead of -fintelfpga to work around a known issue; this lowers report quality
+set(HARDWARE_COMPILE_FLAGS -MMD -fsycl-link-targets=spir64_fpga-unknown-unknown-sycldevice -c -DNUM_ENGINES=${NUM_ENGINES})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsparallel=2 -Xsseed=1 ${NUM_REORDER} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DNUM_ENGINES=${NUM_ENGINES})
+set(FINAL_LINK_FLAGS -fintelfpga -DNUM_ENGINES=${NUM_ENGINES})
+
+set(EMULATOR_COMPILE_FLAGS "-v -v -v -g0 -fintelfpga -DFPGA_EMULATOR -DNUM_ENGINES=${NUM_ENGINES}")
+set(EMULATOR_LINK_FLAGS -fintelfpga)
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "gzipkernel_fpga.o")
+ set(DEVICE_IMAGE_FPGA_OBJ "gzipkernel_fpga.a")
+ set(HOST_SOURCE_FILES_WITH_PATH ${CMAKE_CURRENT_SOURCE_DIR}/gzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/crc32.cpp ${CMAKE_CURRENT_SOURCE_DIR}/WriteGzip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/CompareGzip.cpp)
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+ set(OBJ_FILES)
+ foreach(HOST_FILE ${HOST_SOURCE_FILES_WITH_PATH})
+ set(HOST_FPGA_OBJ ${HOST_FILE}.o)
+ add_custom_command(OUTPUT ${HOST_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${HOST_FILE} -o ${HOST_FPGA_OBJ}
+ DEPENDS ${HOST_FILE})
+ list(APPEND OBJ_FILES ${HOST_FPGA_OBJ})
+ endforeach()
+
+ add_custom_command(OUTPUT ${DEVICE_IMAGE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link=image ${DEVICE_FPGA_OBJ} -o ${DEVICE_IMAGE_FPGA_OBJ}
+ DEPENDS ${DEVICE_FPGA_OBJ} ${OBJ_FILES})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${FINAL_LINK_FLAGS} ${OBJ_FILES} ${DEVICE_IMAGE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_IMAGE_FPGA_OBJ} ${OBJ_FILES})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${DEVICE_SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/kernels.hpp kernels.hpp COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_HEADER_FILE} ${DEVICE_HEADER_FILE} COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE} kernels.hpp)
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu Makefile -o=test.gz
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp
new file mode 100755
index 0000000000..b803dee96b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.cpp
@@ -0,0 +1,85 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include "CompareGzip.hpp"
+
+// returns 0 on success, otherwise failure
+int CompareGzipFiles(
+ const std::string
+ &original_file, // original input file to compare gzip uncompressed
+ const std::string &input_gzfile) // gzip file to check
+{
+#ifdef _MSC_VER
+ std::cout
+ << "Info: skipping output verification on Windows, no builtin gunzip\n";
+ return 0;
+#else
+ //------------------------------------------------------------------
+ // assume all good to start with.
+
+ int gzipstatus = 0;
+
+ //------------------------------------------------------------------
+ // Create temporary output filename for gunzip
+
+ char tmp_name[] = "/tmp/gzip_fpga.XXXXXX";
+ mkstemp(tmp_name);
+ std::string outputfile = tmp_name;
+
+ //------------------------------------------------------------------
+ // Check that the original file and gzipped file exist.
+
+ //------------------------------------------------------------------
+ // gunzip the file produced to stdout, capturing to the temp file.
+
+ std::string cmd = "gunzip -c ";
+ cmd += input_gzfile;
+ cmd += " > " + outputfile;
+
+ int gzout = ::system(cmd.c_str());
+ if (gzout != 0) {
+ gzipstatus = 3;
+ }
+
+ //------------------------------------------------------------------
+ // diff the temp file and the original.
+
+ cmd = "diff -q " + outputfile + " " + original_file;
+ int diffout = ::system(cmd.c_str());
+ if (diffout != 0) {
+ gzipstatus = 4;
+ }
+
+ //------------------------------------------------------------------
+ // Cleanup, remove the temp file.
+
+ (void)::remove(outputfile.c_str());
+
+ return gzipstatus;
+#endif
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp
new file mode 100755
index 0000000000..5624b97cea
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/CompareGzip.hpp
@@ -0,0 +1,41 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __COMPAREGZIP_H__
+#define __COMPAREGZIP_H__
+#pragma once
+
+#include <iostream>
+#include <string>
+
+int CompareGzipFiles(
+ const std::string
+ &original_file, // original input file to compare gzip uncompressed
+ const std::string &input_gzfile); // gzip file to check
+
+#endif //__COMPAREGZIP_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
new file mode 100755
index 0000000000..71c370aa96
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.cpp
@@ -0,0 +1,163 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#define _CRT_SECURE_NO_WARNINGS
+#include "WriteGzip.hpp"
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+#include <iostream>
+#include <string>
+
+constexpr int kDeflated = 8;
+#define GZIP_MAGIC "\037\213" // Magic header for gzip files, 1F 8B
+
+#define ORIG_NAME 0x08
+#define OS_CODE 0x03 // Unix OS_CODE
+
+typedef struct GzipHeader {
+ unsigned char magic[2]; // 0x1f, 0x8b
+ unsigned char compress_method; // 0-7 reserved, 8=deflate -- kDeflated
+ unsigned char flags; // b0: file probably ascii
+ // b1: header crc-16 present
+ // b2: extra field present
+ // b3: original file name present
+ // b4: file comment present
+ // b5,6,7: reserved
+ unsigned long time; // file modification time in Unix format.
+ // Set this to 0 for now.
+
+ unsigned char extra; // depends on compression method
+ unsigned char os; // operating system on which compression took place
+
+ // ...
+  // ? bytes ... compressed data ...
+
+ unsigned long crc;
+ unsigned long uncompressed_sz;
+
+} gzip_header, *pgzip_header;
+
+inline static void PutUlong(uint8_t *pc, unsigned long l) {
+ pc[0] = l & 0xff;
+ pc[1] = (l >> 8) & 0xff;
+ pc[2] = (l >> 16) & 0xff;
+ pc[3] = (l >> 24) & 0xff;
+}
+
+// returns 0 on success, otherwise failure
+int WriteBlockGzip(
+ std::string &original_filename, // Original file name being compressed
+ std::string &out_filename, // gzip filename
+ char *obuf, // pointer to compressed data block
+ size_t blen, // length of compressed data block
+ size_t ilen, // original block length
+ uint32_t buffer_crc) // the block's crc
+{
+  //------------------------------------------------------------------
+  // Set up the gzip output file header.
+  // The maximum filename size is arbitrarily set to 256 bytes.
+  // Method is always DEFLATE.
+  // The original filename is always stored in the header.
+  // Timestamp is set to 0 (ignored by gunzip).
+  // Deflate flags are set to 0.
+  // OS code is set to 3 (Unix) via OS_CODE.
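+  //
+  // The resulting 10-byte fixed header is, byte for byte:
+  //   1f 8b 08 08 00 00 00 00 00 03
+  //   (magic, method=DEFLATE, flags=FNAME, mtime=0, extra flags=0, OS=Unix)
+  // followed by the NUL-terminated original file name.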
+
+ int max_filename_sz = 256;
+
+ unsigned char *pgziphdr =
+ (unsigned char *)malloc(sizeof(gzip_header) + max_filename_sz);
+
+ if (!pgziphdr) {
+ std::cout << "pgzip header cannot be allocated\n";
+ return 1;
+ }
+
+ pgziphdr[0] = GZIP_MAGIC[0];
+ pgziphdr[1] = GZIP_MAGIC[1];
+ pgziphdr[2] = kDeflated;
+ pgziphdr[3] = ORIG_NAME;
+
+ // Set time in header to 0, this is ignored by gunzip.
+ pgziphdr[4] = 0;
+ pgziphdr[5] = 0;
+ pgziphdr[6] = 0;
+ pgziphdr[7] = 0;
+
+ // Deflate flags
+ pgziphdr[8] = 0;
+
+ // OS code is Linux in this case.
+ pgziphdr[9] = OS_CODE;
+
+ int ondx = 10;
+
+ const char *p = original_filename.c_str();
+ do {
+ pgziphdr[ondx++] = (*p);
+ } while (*p++);
+
+ int header_bytes = ondx;
+
+ unsigned char prolog[8];
+
+ PutUlong(((unsigned char *)prolog), buffer_crc);
+ PutUlong(((unsigned char *)&prolog[4]), ilen);
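+  // These 8 bytes form the gzip trailer required by RFC 1952: the CRC-32 of
+  // the uncompressed data followed by its length (ISIZE), both little-endian.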
+
+ FILE *fo = fopen(out_filename.c_str(), "w+");
+  if (fo == NULL) {
+ std::cout << "Cannot open file for output: " << out_filename << "\n";
+ free(pgziphdr);
+ return 1;
+ }
+
+ fwrite(pgziphdr, 1, header_bytes, fo);
+ fwrite(obuf, 1, blen, fo);
+ fwrite(prolog, 1, 8, fo);
+
+ if (ferror(fo)) {
+ std::cout << "gzip output file write failure.\n";
+ free(pgziphdr);
+ return 1;
+ }
+
+ if (fclose(fo)) {
+ perror("close");
+ free(pgziphdr);
+ return 1;
+ }
+ free(pgziphdr);
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp
new file mode 100755
index 0000000000..66bc28e315
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/WriteGzip.hpp
@@ -0,0 +1,45 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __WRITEGZIP_H__
+#define __WRITEGZIP_H__
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+// returns 0 on success, otherwise failure
+int WriteBlockGzip(
+ std::string &original_filename, // Original file name being compressed
+ std::string &out_filename, // gzip filename
+ char *obuf, // pointer to compressed data block
+ size_t blen, // length of compressed data block
+ size_t ilen, // original block length
+ uint32_t buffer_crc); // the block's crc
+
+#endif //__WRITEGZIP_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja
new file mode 100755
index 0000000000..29d50e63a0
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/build.ninja
@@ -0,0 +1,32 @@
+device_source_file = gzipkernel.cpp
+device_header_file = gzipkernel.hpp
+host_source_file = gzip.cpp crc32.cpp WriteGzip.cpp CompareGzip.cpp
+target_name = gzip
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsclock=280MHz -Xsparallel=2 -Xsseed=1
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${device_source_file} ${host_source_file} -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${device_source_file} ${host_source_file} -fsycl-link -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${device_source_file} ${host_source_file} -fsycl-link -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp
new file mode 100755
index 0000000000..8e6c59c734
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.cpp
@@ -0,0 +1,126 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+/*
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "crc32.hpp"
+
+// This table is CRC32s for all single byte values created by using the
+// makecrc.c utility from gzip for compatibility with gzip. makecrc.c can be
+// found in the gzip source code project found at
+// https://git.savannah.gnu.org/git/gzip.git. The polynomial 0xedb88320 is used
+// for gzip, and thus used to create this table.
+//
+// Not copyrighted 1990, Mark Adler.
+//
+const unsigned int crc32_table[] = {
+ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
+ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
+ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
+ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
+ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
+ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
+ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
+ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
+ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
+ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
+ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
+ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
+ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
+ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
+ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
+ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
+ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
+ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
+ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
+ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
+ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
+ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
+ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
+ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
+ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
+ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
+ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
+ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
+ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
+ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
+ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
+ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
+ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
+ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
+ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
+ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
+ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
+ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
+ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
+ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
+ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
+ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
+ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
+ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
+ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
+ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
+ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
+ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
+ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
+ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
+ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
+ 0x2d02ef8dL};
+
+//
+// This routine creates a Crc32 from a memory buffer (address, and length), and
+// a previous crc. This routine can be called iteratively on different portions
+// of the same buffer, using a previously returned crc value. The
+// value 0xffffffff is used for the first buffer invocation.
+unsigned int Crc32Host(
+ const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ unsigned int previous_crc) // previous CRC, allows combining.
+{
+ unsigned int curr_crc = ~previous_crc;
+ if (sz) do {
+ curr_crc =
+ crc32_table[((int)curr_crc ^ (*pbuf++)) & 0xff] ^ (curr_crc >> 8);
+ } while (--sz);
+ return curr_crc ^ 0xffffffffL;
+}
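+
+// Example of chaining (illustrative only): a buffer split into two chunks can
+// be CRC'd incrementally by feeding each result into the next call, e.g.
+//   unsigned int crc = Crc32Host(chunk_a, len_a, seed);  // seed per note above
+//   crc = Crc32Host(chunk_b, len_b, crc);                // continue from chunk_a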
+
+unsigned int Crc32(const char *in, size_t buffer_sz,
+ unsigned int previous_crc) {
+ const int num_nibbles_parallel = 64;
+ const int num_sections =
+ buffer_sz / (num_nibbles_parallel / 2); // how many loop iterations
+ // now deal with the remainder, this should be done on the software host
+ // the post-invert also happens inside crc_reference
+ const char *remaining_data = &in[num_sections * (num_nibbles_parallel / 2)];
+ int remaining_bytes = buffer_sz % (num_nibbles_parallel / 2);
+ return Crc32Host(remaining_data, remaining_bytes, previous_crc);
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
new file mode 100755
index 0000000000..138a8f0754
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/crc32.hpp
@@ -0,0 +1,46 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __CRC32_H__
+#define __CRC32_H__
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+uint32_t Crc32Host(
+ const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ uint32_t previous_crc); // previous CRC, allows combining. First invocation
+ // would use 0xffffffff.
+uint32_t Crc32(const char *pbuf, // pointer to the buffer to crc
+ size_t sz, // number of bytes
+ uint32_t previous_crc); // previous CRC, allows combining. First
+ // invocation would use 0xffffffff.
+
+#endif //__CRC32_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp
new file mode 100755
index 0000000000..9ecfe11728
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzip.cpp
@@ -0,0 +1,520 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "CompareGzip.hpp"
+#include "WriteGzip.hpp"
+#include "crc32.hpp"
+#include "dpc_common.hpp"
+#include "gzipkernel.hpp"
+#include "kernels.hpp"
+
+using namespace sycl;
+
+// The minimum file size of a file to be compressed.
+// Any filesize less than this results in an error.
+constexpr int minimum_filesize = kVec + 1;
+
+bool help = false;
+
+int CompressFile(queue &q, std::string &input_file, std::vector<std::string> outfilenames,
+ int iterations, bool report);
+
+void Help(void) {
+ // Command line arguments.
+ // gzip [options] filetozip [options]
+ // -h,--help : help
+
+ // future options?
+ // -p,performance : output perf metrics
+ // -m,maxmapping=# : maximum mapping size
+
+ std::cout << "gzip filename [options]\n";
+ std::cout << " -h,--help : this help text\n";
+  std::cout
+      << "  -o=<filename>,--output-file=<filename> : specify output file\n";
+}
+
+bool FindGetArg(std::string &arg, const char *str, int defaultval, int *val) {
+ std::size_t found = arg.find(str, 0, strlen(str));
+ if (found != std::string::npos) {
+ int value = atoi(&arg.c_str()[strlen(str)]);
+ *val = value;
+ return true;
+ }
+ return false;
+}
+
+constexpr int kMaxStringLen = 40;
+
+bool FindGetArgString(std::string &arg, const char *str, char *str_value,
+ size_t maxchars) {
+ std::size_t found = arg.find(str, 0, strlen(str));
+ if (found != std::string::npos) {
+ const char *sptr = &arg.c_str()[strlen(str)];
+ for (int i = 0; i < maxchars - 1; i++) {
+ char ch = sptr[i];
+ switch (ch) {
+ case ' ':
+ case '\t':
+ case '\0':
+ str_value[i] = 0;
+ return true;
+ break;
+ default:
+ str_value[i] = ch;
+ break;
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+size_t SyclGetExecTimeNs(event e) {
+ size_t start_time =
+      e.get_profiling_info<info::event_profiling::command_start>();
+ size_t end_time =
+      e.get_profiling_info<info::event_profiling::command_end>();
+ return (end_time - start_time);
+}
+
+int main(int argc, char *argv[]) {
+ std::string infilename = "";
+
+  std::vector<std::string> outfilenames(kNumEngines);
+
+ char str_buffer[kMaxStringLen] = {0};
+
+ // Check the number of arguments specified
+ if (argc != 3) {
+    std::cerr << "Incorrect number of arguments. Correct usage: " << argv[0]
+              << " <input_file> -o=<output_file>\n";
+ return 1;
+ }
+
+ for (int i = 1; i < argc; i++) {
+ if (argv[i][0] == '-') {
+ std::string sarg(argv[i]);
+ if (std::string(argv[i]) == "-h") {
+ help = true;
+ }
+ if (std::string(argv[i]) == "--help") {
+ help = true;
+ }
+
+ FindGetArgString(sarg, "-o=", str_buffer, kMaxStringLen);
+ FindGetArgString(sarg, "--output-file=", str_buffer, kMaxStringLen);
+ } else {
+ infilename = std::string(argv[i]);
+ }
+ }
+
+ if (help) {
+ Help();
+ return 1;
+ }
+
+ try {
+#ifdef FPGA_EMULATOR
+ intel::fpga_emulator_selector device_selector;
+#else
+ intel::fpga_selector device_selector;
+#endif
+ auto prop_list = property_list{property::queue::enable_profiling()};
+ queue q(device_selector, dpc_common::exception_handler, prop_list);
+
+ std::cout << "Running on device: "
+              << q.get_device().get_info<info::device::name>().c_str() << "\n";
+
+ if (infilename == "") {
+ std::cout << "Must specify a filename to compress\n\n";
+ Help();
+ return 1;
+ }
+
+ // next, check valid and acceptable parameter ranges.
+ // if output filename not set, use the default
+ // name, else use the name specified by the user
+ outfilenames[0] = std::string(infilename) + ".gz";
+ if (strlen(str_buffer)) {
+ outfilenames[0] = std::string(str_buffer);
+ }
+ for (size_t i=1; i< kNumEngines; i++) {
+ // Filenames will be of the form outfilename, outfilename2, outfilename3 etc.
+ outfilenames[i] = outfilenames[0] + std::to_string(i+1);
+ }
+
+ std::cout << "Launching GZIP application with " << kNumEngines
+ << " engines\n";
+
+#ifdef FPGA_EMULATOR
+ CompressFile(q, infilename, outfilenames, 1, true);
+#else
+ // warmup run - use this run to warmup accelerator. There are some steps in
+ // the runtime that are only executed on the first kernel invocation but not
+ // on subsequent invocations. So execute all that stuff here before we
+    // measure performance (in the next call to CompressFile()).
+ CompressFile(q, infilename, outfilenames, 1, false);
+ // profile performance
+ CompressFile(q, infilename, outfilenames, 200, true);
+#endif
+ } catch (sycl::exception const &e) {
+ // Catches exceptions in the host code
+ std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) {
+ std::cout << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cout << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+ return 0;
+}
+
+struct KernelInfo {
+  buffer<struct GzipOutInfo, 1> *gzip_out_buf;
+  buffer<uint32_t, 1> *current_crc;
+  buffer<char, 1> *pobuf;
+  buffer<char, 1> *pibuf;
+ char *pobuf_decompress;
+
+ uint32_t buffer_crc[kMinBufferSize];
+ uint32_t refcrc;
+
+ const char *pref_buffer;
+ char *poutput_buffer;
+ size_t file_size;
+ struct GzipOutInfo out_info[kMinBufferSize];
+ int iteration;
+ bool last_block;
+};
+
+// returns 0 on success, otherwise a non-zero failure code.
+int CompressFile(queue &q, std::string &input_file, std::vector<std::string> outfilenames,
+ int iterations, bool report) {
+ size_t isz;
+ char *pinbuf;
+
+ // Read the input file
+ std::string device_string =
+      q.get_device().get_info<info::device::name>().c_str();
+ bool prepin =
+ (device_string.find("s10") !=
+ std::string::npos); // Check if "s10" is found in the device string. If
+ // the device is S10, we pre-pin some buffers to
+ // improve DMA performance, which is needed to
+ // achieve peak kernel throughput. Pre-pinning is
+ // only supported on the PAC-S10 BSP. It's not
+ // needed on PAC-A10 to achieve peak performance.
+
+ std::ifstream file(input_file,
+ std::ios::in | std::ios::binary | std::ios::ate);
+ if (file.is_open()) {
+ isz = file.tellg();
+    if (prepin) {
+      // Pre-pin the buffer with malloc_host() for faster DMA throughput.
+      pinbuf = (char *)malloc_host(isz, q.get_context());
+    } else {
+      pinbuf = new char[isz];
+    }
+ file.seekg(0, std::ios::beg);
+ file.read(pinbuf, isz);
+ file.close();
+ } else {
+ std::cout << "Error: cannot read specified input file\n";
+ return 1;
+ }
+
+ if (isz < minimum_filesize) {
+ std::cout << "Minimum filesize for compression is " << minimum_filesize
+ << "\n";
+ return 1;
+ }
+
+ int buffers_count = iterations;
+
+ // Create an array of kernel info structures and create buffers for kernel
+ // input/output. The buffers are re-used between iterations, but enough
+ // disjoint buffers are created to support double-buffering.
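+  // Specifically, iterations 0..2 each get their own buffer set, and
+  // iteration i >= 3 re-uses the set from iteration i-3, so up to three
+  // iterations can be in flight at a time.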
+ struct KernelInfo *kinfo[kNumEngines];
+ for (size_t eng = 0; eng < kNumEngines; eng++) {
+ kinfo[eng] =
+ (struct KernelInfo *)malloc(sizeof(struct KernelInfo) * buffers_count);
+ if (kinfo[eng] == NULL) {
+ std::cout << "Cannot allocate kernel info buffer.\n";
+ return 1;
+ }
+ for (int i = 0; i < buffers_count; i++) {
+ kinfo[eng][i].file_size = isz;
+ // Allocating slightly larger buffers (+ 16 * kVec) to account for
+ // granularity of kernel writes
+ int outputSize = kinfo[eng][i].file_size + 16 * kVec < kMinBufferSize
+ ? kMinBufferSize
+ : kinfo[eng][i].file_size + 16 * kVec;
+
+      // Re-use the output buffer from iteration i-3; only the first three
+      // iterations allocate new buffers (pre-pinned with malloc_host() when
+      // supported, to improve DMA bandwidth).
+      if (i >= 3) {
+ kinfo[eng][i].poutput_buffer = kinfo[eng][i - 3].poutput_buffer;
+ } else {
+ if (prepin) {
+ kinfo[eng][i].poutput_buffer =
+ (char *)malloc_host(outputSize, q.get_context());
+ } else {
+ kinfo[eng][i].poutput_buffer = (char *)malloc(outputSize);
+ }
+ if (kinfo[eng][i].poutput_buffer == NULL) {
+ std::cout << "Cannot allocate output buffer.\n";
+          free(kinfo[eng]);
+ return 1;
+ }
+ // zero pages to fully allocate them
+ memset(kinfo[eng][i].poutput_buffer, 0, outputSize);
+ }
+
+ kinfo[eng][i].last_block = true;
+ kinfo[eng][i].iteration = i;
+ kinfo[eng][i].pref_buffer = pinbuf;
+
+      kinfo[eng][i].gzip_out_buf =
+          i >= 3 ? kinfo[eng][i - 3].gzip_out_buf
+                 : new buffer<struct GzipOutInfo, 1>(kMinBufferSize);
+      kinfo[eng][i].current_crc =
+          i >= 3 ? kinfo[eng][i - 3].current_crc
+                 : new buffer<unsigned, 1>(kMinBufferSize);
+      kinfo[eng][i].pibuf =
+          i >= 3 ? kinfo[eng][i - 3].pibuf
+                 : new buffer<char, 1>(kinfo[eng][i].file_size);
+      kinfo[eng][i].pobuf =
+          i >= 3 ? kinfo[eng][i - 3].pobuf : new buffer<char, 1>(outputSize);
+ kinfo[eng][i].pobuf_decompress = (char *)malloc(kinfo[eng][i].file_size);
+ }
+ }
+
+ // Create events for the various parts of the execution so that we can profile
+ // their performance.
+ event e_input_dma [kNumEngines][buffers_count]; // Input to the GZIP engine. This is a transfer from host to device.
+ event e_output_dma [kNumEngines][buffers_count]; // Output from the GZIP engine. This is transfer from device to host.
+ event e_crc_dma [kNumEngines][buffers_count]; // Transfer CRC from device to host
+ event e_size_dma [kNumEngines][buffers_count]; // Transfer compressed file size from device to host
+ event e_k_crc [kNumEngines][buffers_count]; // CRC kernel
+ event e_k_lz [kNumEngines][buffers_count]; // LZ77 kernel
+ event e_k_huff [kNumEngines][buffers_count]; // Huffman Encoding kernel
+
+#ifndef FPGA_EMULATOR
+ dpc_common::TimeInterval perf_timer;
+#endif
+
+
+ /*************************************************/
+ /* Main loop where the actual execution happens */
+ /*************************************************/
+ for (int i = 0; i < buffers_count; i++) {
+ for (size_t eng = 0; eng < kNumEngines; eng++) {
+ // Transfer the input data, to be compressed, from host to device.
+ e_input_dma[eng][i] = q.submit([&](handler &h) {
+        auto in_data =
+            kinfo[eng][i].pibuf->get_access<access::mode::discard_write>(h);
+ h.copy(kinfo[eng][i].pref_buffer, in_data);
+ });
+
+ /************************************/
+ /************************************/
+ /* LAUNCH GZIP ENGINE */
+ /************************************/
+ /************************************/
+ SubmitGzipTasks(q, kinfo[eng][i].file_size, kinfo[eng][i].pibuf,
+ kinfo[eng][i].pobuf, kinfo[eng][i].gzip_out_buf,
+ kinfo[eng][i].current_crc, kinfo[eng][i].last_block,
+ e_k_crc[eng][i], e_k_lz[eng][i], e_k_huff[eng][i], eng);
+
+ // Transfer the output (compressed) data from device to host.
+ e_output_dma[eng][i] = q.submit([&](handler &h) {
+        auto out_data =
+            kinfo[eng][i].pobuf->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].poutput_buffer);
+ });
+
+ // Transfer the file size of the compressed output file from device to host.
+ e_size_dma[eng][i] = q.submit([&](handler &h) {
+        auto out_data =
+            kinfo[eng][i].gzip_out_buf->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].out_info);
+ });
+
+ // Transfer the CRC of the compressed output file from device to host.
+ e_crc_dma[eng][i] = q.submit([&](handler &h) {
+        auto out_data =
+            kinfo[eng][i].current_crc->get_access<access::mode::read>(h);
+ h.copy(out_data, kinfo[eng][i].buffer_crc);
+ });
+ }
+ }
+
+ // Wait for all kernels to complete
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ for (int i = 0; i < buffers_count; i++) {
+ e_output_dma[eng][i].wait();
+ e_size_dma[eng][i].wait();
+ e_crc_dma[eng][i].wait();
+ }
+ }
+
+// Stop the timer.
+#ifndef FPGA_EMULATOR
+ double diff_total = perf_timer.Elapsed();
+ double gbps = iterations * isz / (double)diff_total / 1000000000.0;
+#endif
+
+ // Check the compressed file size from each iteration. Make sure the size is actually
+ // less-than-or-equal to the input size. Also calculate the remaining CRC.
+ size_t compressed_sz[kNumEngines];
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ compressed_sz[eng] = 0;
+ for (int i = 0; i < buffers_count; i++) {
+ if (kinfo[eng][i].out_info[0].compression_sz > kinfo[eng][i].file_size) {
+        std::cerr << "Unsupported: compressed file larger than input file ("
+                  << kinfo[eng][i].out_info[0].compression_sz << ")\n";
+ return 1;
+ }
+      // The majority of the CRC is calculated by the CRC kernel on the FPGA,
+      // but the kernel operates on quantized chunks of input data. Any
+      // remaining input data that falls outside those quanta is folded into
+      // the overall CRC by the following host-side function. The last argument
+      // is the running CRC that was computed on the FPGA.
+ kinfo[eng][i].buffer_crc[0] =
+ Crc32(kinfo[eng][i].pref_buffer, kinfo[eng][i].file_size,
+ kinfo[eng][i].buffer_crc[0]);
+ // Accumulate the compressed size across all iterations. Used to
+ // compute compression ratio later.
+ compressed_sz[eng] += kinfo[eng][i].out_info[0].compression_sz;
+ }
+ }
+
+  // Free the input buffer now that all kernels are complete and the time
+  // delta has been captured.
+  if (prepin) {
+    free(pinbuf, q.get_context());
+  } else {
+    delete[] pinbuf;
+  }
+
+ // Write the output compressed data from the first iteration of each engine, to a file.
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ // WriteBlockGzip() returns 1 on failure
+ if (report && WriteBlockGzip(input_file, outfilenames[eng], kinfo[eng][0].poutput_buffer,
+ kinfo[eng][0].out_info[0].compression_sz,
+ kinfo[eng][0].file_size, kinfo[eng][0].buffer_crc[0])) {
+ std::cout << "FAILED\n";
+ return 1;
+ }
+ }
+
+ // Decompress the output from engine-0 and compare against the input file. Only engine-0's
+ // output is verified since all engines are fed the same input data.
+ if (report && CompareGzipFiles(input_file, outfilenames[0])) {
+ std::cout << "FAILED\n";
+ return 1;
+ }
+
+ // Generate throughput report
+ // First gather all the execution times.
+ size_t time_k_crc[kNumEngines];
+ size_t time_k_lz[kNumEngines];
+ size_t time_k_huff[kNumEngines];
+ size_t time_input_dma[kNumEngines];
+ size_t time_output_dma[kNumEngines];
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ time_k_crc[eng] = 0;
+ time_k_lz[eng] = 0;
+ time_k_huff[eng] = 0;
+ time_input_dma[eng] = 0;
+ time_output_dma[eng] = 0;
+ for (int i = 0; i < buffers_count; i++) {
+ e_k_crc[eng][i].wait();
+ e_k_lz[eng][i].wait();
+ e_k_huff[eng][i].wait();
+ time_k_crc[eng] += SyclGetExecTimeNs(e_k_crc[eng][i]);
+ time_k_lz[eng] += SyclGetExecTimeNs(e_k_lz[eng][i]);
+ time_k_huff[eng] += SyclGetExecTimeNs(e_k_huff[eng][i]);
+ time_input_dma[eng] += SyclGetExecTimeNs(e_input_dma[eng][i]);
+ time_output_dma[eng] += SyclGetExecTimeNs(e_output_dma[eng][i]);
+ }
+ }
+
+ if (report) {
+ double compression_ratio =
+ (double)((double)compressed_sz[0] / (double)isz / iterations);
+#ifndef FPGA_EMULATOR
+ std::cout << "Throughput: " << kNumEngines * gbps << " GB/s\n\n";
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ std::cout << "TP breakdown for engine #" << eng << " (GB/s)\n";
+ std::cout << "CRC = " << iterations * isz / (double)time_k_crc[eng]
+ << "\n";
+ std::cout << "LZ77 = " << iterations * isz / (double)time_k_lz[eng]
+ << "\n";
+ std::cout << "Huffman Encoding = "
+ << iterations * isz / (double)time_k_huff[eng] << "\n";
+ std::cout << "DMA host-to-device = "
+ << iterations * isz / (double)time_input_dma[eng] << "\n";
+ std::cout << "DMA device-to-host = "
+ << iterations * isz / (double)time_output_dma[eng] << "\n\n";
+ }
+#endif
+ std::cout << "Compression Ratio " << compression_ratio * 100 << "%\n";
+ }
+
+ // Cleanup anything that was allocated by this routine.
+ for (int eng = 0; eng < kNumEngines; eng++) {
+ for (int i = 0; i < buffers_count; i++) {
+ if (i < 3) {
+ delete kinfo[eng][i].gzip_out_buf;
+ delete kinfo[eng][i].current_crc;
+ delete kinfo[eng][i].pibuf;
+ delete kinfo[eng][i].pobuf;
+ if (prepin) {
+ free(kinfo[eng][i].poutput_buffer, q.get_context());
+ } else {
+ free(kinfo[eng][i].poutput_buffer);
+ }
+ }
+ free(kinfo[eng][i].pobuf_decompress);
+ }
+ free(kinfo[eng]);
+ }
+
+ if (report) std::cout << "PASSED\n";
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp
new file mode 100755
index 0000000000..01d69c1f9b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.cpp
@@ -0,0 +1,2406 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+/*
+ * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <CL/sycl.hpp>
+
+#include "gzipkernel.hpp"
+#include "kernels.hpp"
+
+using namespace sycl;
+
+// This reference design uses a template-based unroller. It's also possible
+// to specify this in a more concise way using a pragma. See the loop unroll
+// tutorial for more information.
+template <int Begin, int End>
+struct Unroller {
+  template <typename Action>
+  static void step(const Action &action) {
+    action(Begin);
+    Unroller<Begin + 1, End>::step(action);
+  }
+};
+
+template <int End>
+struct Unroller<End, End> {
+  template <typename Action>
+  static void step(const Action &action) {}
+};
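+// For illustration only: an equivalent full unroll could also be written with
+// an ordinary loop and an unroll pragma, e.g.
+//
+//   #pragma unroll
+//   for (int i = 0; i < kVec; i++) {
+//     action(i);
+//   }
+//
+// The template recursion above simply makes the unrolling explicit at compile
+// time.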
+
+int GetHuffLiteralBits(unsigned char ch) {
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ return static_ltree[ch].code;
+}
+
+int GetHuffLiteralLen(unsigned char ch) {
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ return static_ltree[ch].len;
+}
+
+int GetHuffRunLen(int len, int initial_dist) {
+ int lc;
+ unsigned code;
+ int extra;
+ int dist;
+ int local_lbits, local_llen;
+ int local_dbits, local_dlen;
+ local_lbits = 0;
+ local_llen = 0;
+
+ int base_length[kLengthCodes] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24,
+ 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0,
+ };
+
+ int extra_lbits[kLengthCodes] // extra bits for each length code
+ = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
+
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+
+ // distance codes. The first 256 values correspond to the distances
+ // 3 .. 258, the last 256 values correspond to the top 8 bits of
+ // the 15 bit distances.
+ unsigned char dist_code[512] = {
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29,
+ };
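+  // Note: d_code() is defined in kernels.hpp; it is assumed here to follow
+  // the standard zlib mapping into this table, roughly
+  //   d_code(dist) == (dist < 256) ? dist_code[dist]
+  //                                : dist_code[256 + (dist >> 7)],
+  // i.e. small distances index the first half directly and larger distances
+  // are indexed by their top bits.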
+ // length code for each normalized match length (0 == kMinMatch)
+ unsigned char length_code[kMaxMatch - kMinMatch + 1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+ 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 28,
+ };
+
+ int extra_dbits[kDCodes] // extra bits for each distance code
+ = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+
+ int base_dist[kDCodes] = {
+ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24,
+ 32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
+ 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
+ };
+
+ CtData static_dtree[kDCodes] = {
+ {0, 5}, {16, 5}, {8, 5}, {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5},
+ {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5},
+ {1, 5}, {17, 5}, {9, 5}, {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5},
+ {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5},
+ };
+
+ lc = len - kMinMatch;
+ code = length_code[lc];
+
+ local_lbits = static_ltree[code + kLiterals + 1].code;
+ local_llen = static_ltree[code + kLiterals + 1].len;
+ extra = extra_lbits[code];
+ if (extra) {
+ lc -= base_length[code];
+ local_lbits |= lc << local_llen;
+ local_llen += extra;
+ }
+
+ dist = initial_dist;
+ dist--;
+ code = d_code(dist);
+ local_dbits = static_dtree[code].code;
+ local_dlen = static_dtree[code].len;
+ extra = extra_dbits[code];
+ if (extra) {
+ dist -= base_dist[code];
+ local_dbits |= dist << local_dlen;
+ local_dlen += extra;
+ }
+
+ local_lbits |= local_dbits << local_llen;
+ local_llen += local_dlen;
+
+ return local_llen;
+}
+
+int GetHuffRunBits(int len, int initial_dist) {
+ int lc;
+ unsigned code;
+ int extra;
+ int dist;
+ int local_lbits, local_llen;
+ int local_dbits, local_dlen;
+ local_lbits = 0;
+ local_llen = 0;
+
+ int base_length[kLengthCodes] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24,
+ 28, 32, 40, 48, 56, 64, 80, 96, 112, 128, 160, 192, 224, 0,
+ };
+
+ int extra_lbits[kLengthCodes] // extra bits for each length code
+ = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+ 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0};
+
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+
+ // distance codes. The first 256 values correspond to the distances
+ // 3 .. 258, the last 256 values correspond to the top 8 bits of
+ // the 15 bit distances.
+ unsigned char dist_code[512] = {
+ 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 0, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29,
+ };
+ // length code for each normalized match length (0 == kMinMatch)
+ unsigned char length_code[kMaxMatch - kMinMatch + 1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18,
+ 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+ 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 28,
+ };
+
+ int extra_dbits[kDCodes] // extra bits for each distance code
+ = {0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+
+ int base_dist[kDCodes] = {
+ 0, 1, 2, 3, 4, 6, 8, 12, 16, 24,
+ 32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
+ 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576,
+ };
+
+ CtData static_dtree[kDCodes] = {
+ {0, 5}, {16, 5}, {8, 5}, {24, 5}, {4, 5}, {20, 5}, {12, 5}, {28, 5},
+ {2, 5}, {18, 5}, {10, 5}, {26, 5}, {6, 5}, {22, 5}, {14, 5}, {30, 5},
+ {1, 5}, {17, 5}, {9, 5}, {25, 5}, {5, 5}, {21, 5}, {13, 5}, {29, 5},
+ {3, 5}, {19, 5}, {11, 5}, {27, 5}, {7, 5}, {23, 5},
+ };
+
+ lc = len - kMinMatch;
+ code = length_code[lc];
+
+ local_lbits = static_ltree[code + kLiterals + 1].code;
+ local_llen = static_ltree[code + kLiterals + 1].len;
+ extra = extra_lbits[code];
+ if (extra) {
+ lc -= base_length[code];
+ local_lbits |= lc << local_llen;
+ local_llen += extra;
+ }
+
+ dist = initial_dist;
+ dist--;
+ code = d_code(dist);
+ local_dbits = static_dtree[code].code;
+ local_dlen = static_dtree[code].len;
+ extra = extra_dbits[code];
+ if (extra) {
+ dist -= base_dist[code];
+ local_dbits |= dist << local_dlen;
+ local_dlen += extra;
+ }
+
+ local_lbits |= local_dbits << local_llen;
+ local_llen += local_dlen;
+
+ return local_lbits;
+}
+
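+// Convention for the "len" argument used by GetHuffLen(), IsValid() and
+// GetHuffBits() (inferred from the switch statements below): -3 selects the
+// end-of-block code, -2 emits the raw 3-bit value passed in "ch", -1 marks an
+// invalid slot that contributes no bits, 0 emits the literal "ch", and any
+// positive value emits a length/distance pair for an LZ77 match.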
+int GetHuffLen(int len, int dist, unsigned char ch) {
+ int returned_len;
+
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ switch (len) {
+ case -3:
+ returned_len = static_ltree[kEndBlock].len;
+ break;
+ case -2:
+ returned_len = 3;
+ break;
+ case -1:
+ returned_len = 0;
+ break;
+ case 0:
+ returned_len = GetHuffLiteralLen(ch);
+ break;
+ default:
+ returned_len = GetHuffRunLen(len, dist);
+ break;
+ }
+ return returned_len;
+}
+
+int IsValid(int len, int dist, unsigned char ch) {
+ switch (len) {
+ case -3:
+ return 1;
+ case -2:
+ return 1;
+ case -1:
+ return 0;
+ case 0:
+ return 1;
+ default:
+ return 1;
+ }
+}
+
+int GetHuffBits(int len, int dist, unsigned char ch) {
+ int bits;
+ CtData static_ltree[kLCodes + 2] = {
+ {12, 8}, {140, 8}, {76, 8}, {204, 8}, {44, 8}, {172, 8}, {108, 8},
+ {236, 8}, {28, 8}, {156, 8}, {92, 8}, {220, 8}, {60, 8}, {188, 8},
+ {124, 8}, {252, 8}, {2, 8}, {130, 8}, {66, 8}, {194, 8}, {34, 8},
+ {162, 8}, {98, 8}, {226, 8}, {18, 8}, {146, 8}, {82, 8}, {210, 8},
+ {50, 8}, {178, 8}, {114, 8}, {242, 8}, {10, 8}, {138, 8}, {74, 8},
+ {202, 8}, {42, 8}, {170, 8}, {106, 8}, {234, 8}, {26, 8}, {154, 8},
+ {90, 8}, {218, 8}, {58, 8}, {186, 8}, {122, 8}, {250, 8}, {6, 8},
+ {134, 8}, {70, 8}, {198, 8}, {38, 8}, {166, 8}, {102, 8}, {230, 8},
+ {22, 8}, {150, 8}, {86, 8}, {214, 8}, {54, 8}, {182, 8}, {118, 8},
+ {246, 8}, {14, 8}, {142, 8}, {78, 8}, {206, 8}, {46, 8}, {174, 8},
+ {110, 8}, {238, 8}, {30, 8}, {158, 8}, {94, 8}, {222, 8}, {62, 8},
+ {190, 8}, {126, 8}, {254, 8}, {1, 8}, {129, 8}, {65, 8}, {193, 8},
+ {33, 8}, {161, 8}, {97, 8}, {225, 8}, {17, 8}, {145, 8}, {81, 8},
+ {209, 8}, {49, 8}, {177, 8}, {113, 8}, {241, 8}, {9, 8}, {137, 8},
+ {73, 8}, {201, 8}, {41, 8}, {169, 8}, {105, 8}, {233, 8}, {25, 8},
+ {153, 8}, {89, 8}, {217, 8}, {57, 8}, {185, 8}, {121, 8}, {249, 8},
+ {5, 8}, {133, 8}, {69, 8}, {197, 8}, {37, 8}, {165, 8}, {101, 8},
+ {229, 8}, {21, 8}, {149, 8}, {85, 8}, {213, 8}, {53, 8}, {181, 8},
+ {117, 8}, {245, 8}, {13, 8}, {141, 8}, {77, 8}, {205, 8}, {45, 8},
+ {173, 8}, {109, 8}, {237, 8}, {29, 8}, {157, 8}, {93, 8}, {221, 8},
+ {61, 8}, {189, 8}, {125, 8}, {253, 8}, {19, 9}, {275, 9}, {147, 9},
+ {403, 9}, {83, 9}, {339, 9}, {211, 9}, {467, 9}, {51, 9}, {307, 9},
+ {179, 9}, {435, 9}, {115, 9}, {371, 9}, {243, 9}, {499, 9}, {11, 9},
+ {267, 9}, {139, 9}, {395, 9}, {75, 9}, {331, 9}, {203, 9}, {459, 9},
+ {43, 9}, {299, 9}, {171, 9}, {427, 9}, {107, 9}, {363, 9}, {235, 9},
+ {491, 9}, {27, 9}, {283, 9}, {155, 9}, {411, 9}, {91, 9}, {347, 9},
+ {219, 9}, {475, 9}, {59, 9}, {315, 9}, {187, 9}, {443, 9}, {123, 9},
+ {379, 9}, {251, 9}, {507, 9}, {7, 9}, {263, 9}, {135, 9}, {391, 9},
+ {71, 9}, {327, 9}, {199, 9}, {455, 9}, {39, 9}, {295, 9}, {167, 9},
+ {423, 9}, {103, 9}, {359, 9}, {231, 9}, {487, 9}, {23, 9}, {279, 9},
+ {151, 9}, {407, 9}, {87, 9}, {343, 9}, {215, 9}, {471, 9}, {55, 9},
+ {311, 9}, {183, 9}, {439, 9}, {119, 9}, {375, 9}, {247, 9}, {503, 9},
+ {15, 9}, {271, 9}, {143, 9}, {399, 9}, {79, 9}, {335, 9}, {207, 9},
+ {463, 9}, {47, 9}, {303, 9}, {175, 9}, {431, 9}, {111, 9}, {367, 9},
+ {239, 9}, {495, 9}, {31, 9}, {287, 9}, {159, 9}, {415, 9}, {95, 9},
+ {351, 9}, {223, 9}, {479, 9}, {63, 9}, {319, 9}, {191, 9}, {447, 9},
+ {127, 9}, {383, 9}, {255, 9}, {511, 9}, {0, 7}, {64, 7}, {32, 7},
+ {96, 7}, {16, 7}, {80, 7}, {48, 7}, {112, 7}, {8, 7}, {72, 7},
+ {40, 7}, {104, 7}, {24, 7}, {88, 7}, {56, 7}, {120, 7}, {4, 7},
+ {68, 7}, {36, 7}, {100, 7}, {20, 7}, {84, 7}, {52, 7}, {116, 7},
+ {3, 8}, {131, 8}, {67, 8}, {195, 8}, {35, 8}, {163, 8}, {99, 8},
+ {227, 8},
+ };
+ switch (len) {
+ case -3:
+ bits = static_ltree[kEndBlock].code;
+ break;
+ case -2:
+ bits = ch;
+ break;
+ case -1:
+ bits = 0;
+ break;
+ case 0:
+ bits = GetHuffLiteralBits(ch);
+ break;
+ default:
+ bits = GetHuffRunBits(len, dist);
+ break;
+ }
+ return bits;
+}
+
+// Assembles up to kVecX2 unsigned char values based on the given Huffman
+// encoding and writes up to kMaxHuffcodeBits * kVecX2 bits to memory.
+bool HufEnc(char *len, short *dist, unsigned char *data, unsigned int *outdata,
+ unsigned int *leftover, unsigned short *leftover_size) {
+ // array that contains the bit position of each symbol
+ unsigned short bitpos[kVec + 1];
+ bitpos[0] = 0;
+
+ Unroller<0, kVec>::step([&](int i) {
+ bitpos[i + 1] = bitpos[i] + (IsValid(len[i], dist[i], data[i])
+ ? GetHuffLen(len[i], dist[i], data[i])
+ : 0);
+ });
+
+  // leftover is an array that carries Huffman-encoded data not yet written to
+  // memory. Adjust leftover_size by the number of bits collected this cycle.
+  unsigned short prev_cycle_offset = *leftover_size;
+  *leftover_size += (bitpos[kVec] & 0x3fff);
+
+ // we'll write this cycle if we have collected enough data (kVec shorts or
+ // more)
+ bool write = *leftover_size & (kVec * (kMaxHuffcodeBits * 2));
+
+ // subtract kVec shorts from leftover size (if it's bigger
+ // than kVec) because we'll write those out this cycle
+ *leftover_size &= ~(kVec * (kMaxHuffcodeBits * 2));
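+  // Note: this bookkeeping assumes kVec * (kMaxHuffcodeBits * 2) is a single
+  // power-of-two bit position: the AND above tests whether at least that many
+  // bits have accumulated, and the mask clear above removes them from the
+  // count once they are scheduled to be written out this cycle.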
+
+ // Adjust bitpos based on leftover offset from previous cycle
+ Unroller<0, kVec>::step(
+ [&](int i) { bitpos[i] += (prev_cycle_offset & 0x3fff); });
+
+  // Huffman codes can have arbitrary bit alignment, so a code can spill
+  // across two words of the output array. Use the two-component Uint2Gzip
+  // struct to keep the two parts of each code separate: iterate over all
+  // codes and construct a Uint2Gzip value containing the code properly
+  // aligned.
+ struct Uint2Gzip code[kVec];
+ Unroller<0, kVec>::step([&](int i) {
+ code[i].x = 0;
+ code[i].y = 0;
+ });
+
+ Unroller<0, kVec>::step([&](int i) {
+ // Codes can be more than 16 bits, so use uint32
+ unsigned int curr_code = GetHuffBits(len[i], dist[i], data[i]);
+ unsigned char bitpos_in_short = bitpos[i] & 0x01F;
+
+ unsigned long long temp = (unsigned long long)curr_code << bitpos_in_short;
+ unsigned int temp1 = (unsigned int)temp;
+ unsigned int temp2 = temp >> 32ULL;
+
+ if (IsValid(len[i], dist[i], data[i])) {
+ code[i].x = temp1;
+ code[i].y = temp2;
+ } else {
+ code[i].x = temp1;
+ code[i].y = temp2;
+ }
+ });
+
+ // Iterate over all destination locations and gather the required data
+ unsigned int new_leftover[kVec];
+ Unroller<0, kVec>::step([&](int i) {
+ new_leftover[i] = 0;
+ outdata[i] = 0;
+
+ Unroller<0, kVec>::step([&](int j) {
+ // figure out whether code[j] goes into bucket[i]
+ bool match_first = ((bitpos[j] >> 5) & (kVec - 1)) == i;
+ bool match_second =
+ ((bitpos[j] >> 5) & (kVec - 1)) == ((i - 1) & (kVec - 1));
+
+ // if code[j] maps onto current bucket then OR its code, else OR with 0
+ unsigned int component =
+ match_first ? code[j].x : (match_second ? code[j].y : 0);
+
+ // overflow from kVec shorts, need to move onto new_leftover
+ bool use_later =
+ (bitpos[j] & (kVec * (kMaxHuffcodeBits * 2))) ||
+ (match_second && (((bitpos[j] >> 5) & (kVec - 1)) == kVec - 1));
+
+ // write to output
+ outdata[i] |= use_later ? 0 : component;
+ new_leftover[i] |= use_later ? component : 0;
+ });
+ });
+
+ // Apply previous leftover on the outdata
+ // Also, if didn't write, apply prev leftover onto newleftover
+ Unroller<0, kVec>::step([&](int i) {
+ outdata[i] |= leftover[i];
+ leftover[i] = outdata[i];
+ });
+
+ // Split unroll into two unrolls to avoid compiler crash. This is a temporary
+ // workaround while awaiting a compiler feature.
+ if (write) {
+ Unroller<0, kVec>::step([&](int i) { leftover[i] = new_leftover[i]; });
+ }
+
+ return write;
+}
+
+// Forward declarations of the kernel name classes, templated on the engine
+// index so that each engine instantiates uniquely named kernels.
+template <int engineID>
+class CRC;
+template <int engineID>
+class LZReduction;
+template <int engineID>
+class StaticHuffman;
+
+template <int engineID>
+void SubmitGzipTasksSingleEngine(
+    queue &q,
+    size_t block_size,  // size of block to compress.
+    buffer<char, 1> *pibuf, buffer<char, 1> *pobuf,
+    buffer<struct GzipOutInfo, 1> *gzip_out_buf,
+    buffer<unsigned, 1> *result_crc, bool last_block, event &e_crc,
+    event &e_lz, event &e_huff) {
+ using acc_dist_channel = intel::pipe;
+ using acc_dist_channel_last = intel::pipe;
+
+  e_crc = q.submit([&](handler &h) {
+    auto accessor_isz = block_size;
+    auto acc_pibuf = pibuf->get_access<access::mode::read>(h);
+    auto accresult_crc = result_crc->get_access<access::mode::discard_write>(h);
+    h.single_task<CRC<engineID>>([=]() [[intel::kernel_args_restrict]] {
+ const unsigned int table64[64][16] = {
+ {
+ 0x0,
+ 0xf1da05aa,
+ 0x38c50d15,
+ 0xc91f08bf,
+ 0x718a1a2a,
+ 0x80501f80,
+ 0x494f173f,
+ 0xb8951295,
+ 0xe3143454,
+ 0x12ce31fe,
+ 0xdbd13941,
+ 0x2a0b3ceb,
+ 0x929e2e7e,
+ 0x63442bd4,
+ 0xaa5b236b,
+ 0x5b8126c1,
+ },
+
+ {
+ 0x0,
+ 0x1d596ee9,
+ 0x3ab2ddd2,
+ 0x27ebb33b,
+ 0x7565bba4,
+ 0x683cd54d,
+ 0x4fd76676,
+ 0x528e089f,
+ 0xeacb7748,
+ 0xf79219a1,
+ 0xd079aa9a,
+ 0xcd20c473,
+ 0x9faeccec,
+ 0x82f7a205,
+ 0xa51c113e,
+ 0xb8457fd7,
+ },
+
+ {
+ 0x0,
+ 0xee7e8d1,
+ 0x1dcfd1a2,
+ 0x13283973,
+ 0x3b9fa344,
+ 0x35784b95,
+ 0x265072e6,
+ 0x28b79a37,
+ 0x773f4688,
+ 0x79d8ae59,
+ 0x6af0972a,
+ 0x64177ffb,
+ 0x4ca0e5cc,
+ 0x42470d1d,
+ 0x516f346e,
+ 0x5f88dcbf,
+ },
+
+ {
+ 0x0,
+ 0xee7e8d10,
+ 0x78c1c61,
+ 0xe9f29171,
+ 0xf1838c2,
+ 0xe166b5d2,
+ 0x89424a3,
+ 0xe6eaa9b3,
+ 0x1e307184,
+ 0xf04efc94,
+ 0x19bc6de5,
+ 0xf7c2e0f5,
+ 0x11284946,
+ 0xff56c456,
+ 0x16a45527,
+ 0xf8dad837,
+ },
+
+ {
+ 0x0,
+ 0x3c60e308,
+ 0x78c1c610,
+ 0x44a12518,
+ 0xf1838c20,
+ 0xcde36f28,
+ 0x89424a30,
+ 0xb522a938,
+ 0x38761e01,
+ 0x416fd09,
+ 0x40b7d811,
+ 0x7cd73b19,
+ 0xc9f59221,
+ 0xf5957129,
+ 0xb1345431,
+ 0x8d54b739,
+ },
+
+ {
+ 0x0,
+ 0x70ec3c02,
+ 0xe1d87804,
+ 0x91344406,
+ 0x18c1f649,
+ 0x682dca4b,
+ 0xf9198e4d,
+ 0x89f5b24f,
+ 0x3183ec92,
+ 0x416fd090,
+ 0xd05b9496,
+ 0xa0b7a894,
+ 0x29421adb,
+ 0x59ae26d9,
+ 0xc89a62df,
+ 0xb8765edd,
+ },
+
+ {
+ 0x0,
+ 0x6307d924,
+ 0xc60fb248,
+ 0xa5086b6c,
+ 0x576e62d1,
+ 0x3469bbf5,
+ 0x9161d099,
+ 0xf26609bd,
+ 0xaedcc5a2,
+ 0xcddb1c86,
+ 0x68d377ea,
+ 0xbd4aece,
+ 0xf9b2a773,
+ 0x9ab57e57,
+ 0x3fbd153b,
+ 0x5cbacc1f,
+ },
+
+ {
+ 0x0,
+ 0x86c88d05,
+ 0xd6e01c4b,
+ 0x5028914e,
+ 0x76b13ed7,
+ 0xf079b3d2,
+ 0xa051229c,
+ 0x2699af99,
+ 0xed627dae,
+ 0x6baaf0ab,
+ 0x3b8261e5,
+ 0xbd4aece0,
+ 0x9bd34379,
+ 0x1d1bce7c,
+ 0x4d335f32,
+ 0xcbfbd237,
+ },
+
+ {
+ 0x0,
+ 0x1b5fd1d,
+ 0x36bfa3a,
+ 0x2de0727,
+ 0x6d7f474,
+ 0x7620969,
+ 0x5bc0e4e,
+ 0x409f353,
+ 0xdafe8e8,
+ 0xc1a15f5,
+ 0xec412d2,
+ 0xf71efcf,
+ 0xb781c9c,
+ 0xacde181,
+ 0x813e6a6,
+ 0x9a61bbb,
+ },
+
+ {
+ 0x0,
+ 0x1b5fd1d0,
+ 0x36bfa3a0,
+ 0x2de07270,
+ 0x6d7f4740,
+ 0x76209690,
+ 0x5bc0e4e0,
+ 0x409f3530,
+ 0xdafe8e80,
+ 0xc1a15f50,
+ 0xec412d20,
+ 0xf71efcf0,
+ 0xb781c9c0,
+ 0xacde1810,
+ 0x813e6a60,
+ 0x9a61bbb0,
+ },
+
+ {
+ 0x0,
+ 0x6e8c1b41,
+ 0xdd183682,
+ 0xb3942dc3,
+ 0x61416b45,
+ 0xfcd7004,
+ 0xbc595dc7,
+ 0xd2d54686,
+ 0xc282d68a,
+ 0xac0ecdcb,
+ 0x1f9ae008,
+ 0x7116fb49,
+ 0xa3c3bdcf,
+ 0xcd4fa68e,
+ 0x7edb8b4d,
+ 0x1057900c,
+ },
+
+ {
+ 0x0,
+ 0x5e74ab55,
+ 0xbce956aa,
+ 0xe29dfdff,
+ 0xa2a3ab15,
+ 0xfcd70040,
+ 0x1e4afdbf,
+ 0x403e56ea,
+ 0x9e36506b,
+ 0xc042fb3e,
+ 0x22df06c1,
+ 0x7cabad94,
+ 0x3c95fb7e,
+ 0x62e1502b,
+ 0x807cadd4,
+ 0xde080681,
+ },
+
+ {
+ 0x0,
+ 0xe71da697,
+ 0x154a4b6f,
+ 0xf257edf8,
+ 0x2a9496de,
+ 0xcd893049,
+ 0x3fdeddb1,
+ 0xd8c37b26,
+ 0x55292dbc,
+ 0xb2348b2b,
+ 0x406366d3,
+ 0xa77ec044,
+ 0x7fbdbb62,
+ 0x98a01df5,
+ 0x6af7f00d,
+ 0x8dea569a,
+ },
+
+ {
+ 0x0,
+ 0xaa525b78,
+ 0x8fd5b0b1,
+ 0x2587ebc9,
+ 0xc4da6723,
+ 0x6e883c5b,
+ 0x4b0fd792,
+ 0xe15d8cea,
+ 0x52c5c807,
+ 0xf897937f,
+ 0xdd1078b6,
+ 0x774223ce,
+ 0x961faf24,
+ 0x3c4df45c,
+ 0x19ca1f95,
+ 0xb39844ed,
+ },
+
+ {
+ 0x0,
+ 0xa58b900e,
+ 0x9066265d,
+ 0x35edb653,
+ 0xfbbd4afb,
+ 0x5e36daf5,
+ 0x6bdb6ca6,
+ 0xce50fca8,
+ 0x2c0b93b7,
+ 0x898003b9,
+ 0xbc6db5ea,
+ 0x19e625e4,
+ 0xd7b6d94c,
+ 0x723d4942,
+ 0x47d0ff11,
+ 0xe25b6f1f,
+ },
+
+ {
+ 0x0,
+ 0x5817276e,
+ 0xb02e4edc,
+ 0xe83969b2,
+ 0xbb2d9bf9,
+ 0xe33abc97,
+ 0xb03d525,
+ 0x5314f24b,
+ 0xad2a31b3,
+ 0xf53d16dd,
+ 0x1d047f6f,
+ 0x45135801,
+ 0x1607aa4a,
+ 0x4e108d24,
+ 0xa629e496,
+ 0xfe3ec3f8,
+ },
+
+ {
+ 0x0,
+ 0x81256527,
+ 0xd93bcc0f,
+ 0x581ea928,
+ 0x69069e5f,
+ 0xe823fb78,
+ 0xb03d5250,
+ 0x31183777,
+ 0xd20d3cbe,
+ 0x53285999,
+ 0xb36f0b1,
+ 0x8a139596,
+ 0xbb0ba2e1,
+ 0x3a2ec7c6,
+ 0x62306eee,
+ 0xe3150bc9,
+ },
+
+ {
+ 0x0,
+ 0x7f6b7f3d,
+ 0xfed6fe7a,
+ 0x81bd8147,
+ 0x26dcfab5,
+ 0x59b78588,
+ 0xd80a04cf,
+ 0xa7617bf2,
+ 0x4db9f56a,
+ 0x32d28a57,
+ 0xb36f0b10,
+ 0xcc04742d,
+ 0x6b650fdf,
+ 0x140e70e2,
+ 0x95b3f1a5,
+ 0xead88e98,
+ },
+
+ {
+ 0x0,
+ 0x9b73ead4,
+ 0xed96d3e9,
+ 0x76e5393d,
+ 0x5ca193,
+ 0x9b2f4b47,
+ 0xedca727a,
+ 0x76b998ae,
+ 0xb94326,
+ 0x9bcaa9f2,
+ 0xed2f90cf,
+ 0x765c7a1b,
+ 0xe5e2b5,
+ 0x9b960861,
+ 0xed73315c,
+ 0x7600db88,
+ },
+
+ {
+ 0x0,
+ 0x172864c,
+ 0x2e50c98,
+ 0x3978ad4,
+ 0x5ca1930,
+ 0x4b89f7c,
+ 0x72f15a8,
+ 0x65d93e4,
+ 0xb943260,
+ 0xae6b42c,
+ 0x9713ef8,
+ 0x803b8b4,
+ 0xe5e2b50,
+ 0xf2cad1c,
+ 0xcbb27c8,
+ 0xdc9a184,
+ },
+
+ {
+ 0x0,
+ 0x172864c0,
+ 0x2e50c980,
+ 0x3978ad40,
+ 0x5ca19300,
+ 0x4b89f7c0,
+ 0x72f15a80,
+ 0x65d93e40,
+ 0xb9432600,
+ 0xae6b42c0,
+ 0x9713ef80,
+ 0x803b8b40,
+ 0xe5e2b500,
+ 0xf2cad1c0,
+ 0xcbb27c80,
+ 0xdc9a1840,
+ },
+
+ {
+ 0x0,
+ 0xa9f74a41,
+ 0x889f92c3,
+ 0x2168d882,
+ 0xca4e23c7,
+ 0x63b96986,
+ 0x42d1b104,
+ 0xeb26fb45,
+ 0x4fed41cf,
+ 0xe61a0b8e,
+ 0xc772d30c,
+ 0x6e85994d,
+ 0x85a36208,
+ 0x2c542849,
+ 0xd3cf0cb,
+ 0xa4cbba8a,
+ },
+
+ {
+ 0x0,
+ 0x9fda839e,
+ 0xe4c4017d,
+ 0x7b1e82e3,
+ 0x12f904bb,
+ 0x8d238725,
+ 0xf63d05c6,
+ 0x69e78658,
+ 0x25f20976,
+ 0xba288ae8,
+ 0xc136080b,
+ 0x5eec8b95,
+ 0x370b0dcd,
+ 0xa8d18e53,
+ 0xd3cf0cb0,
+ 0x4c158f2e,
+ },
+
+ {
+ 0x0,
+ 0x4be412ec,
+ 0x97c825d8,
+ 0xdc2c3734,
+ 0xf4e14df1,
+ 0xbf055f1d,
+ 0x63296829,
+ 0x28cd7ac5,
+ 0x32b39da3,
+ 0x79578f4f,
+ 0xa57bb87b,
+ 0xee9faa97,
+ 0xc652d052,
+ 0x8db6c2be,
+ 0x519af58a,
+ 0x1a7ee766,
+ },
+
+ {
+ 0x0,
+ 0x65673b46,
+ 0xcace768c,
+ 0xafa94dca,
+ 0x4eedeb59,
+ 0x2b8ad01f,
+ 0x84239dd5,
+ 0xe144a693,
+ 0x9ddbd6b2,
+ 0xf8bcedf4,
+ 0x5715a03e,
+ 0x32729b78,
+ 0xd3363deb,
+ 0xb65106ad,
+ 0x19f84b67,
+ 0x7c9f7021,
+ },
+
+ {
+ 0x0,
+ 0xe0c6ab25,
+ 0x1afc500b,
+ 0xfa3afb2e,
+ 0x35f8a016,
+ 0xd53e0b33,
+ 0x2f04f01d,
+ 0xcfc25b38,
+ 0x6bf1402c,
+ 0x8b37eb09,
+ 0x710d1027,
+ 0x91cbbb02,
+ 0x5e09e03a,
+ 0xbecf4b1f,
+ 0x44f5b031,
+ 0xa4331b14,
+ },
+
+ {
+ 0x0,
+ 0xd7e28058,
+ 0x74b406f1,
+ 0xa35686a9,
+ 0xe9680de2,
+ 0x3e8a8dba,
+ 0x9ddc0b13,
+ 0x4a3e8b4b,
+ 0x9a11d85,
+ 0xde439ddd,
+ 0x7d151b74,
+ 0xaaf79b2c,
+ 0xe0c91067,
+ 0x372b903f,
+ 0x947d1696,
+ 0x439f96ce,
+ },
+
+ {
+ 0x0,
+ 0x13423b0a,
+ 0x26847614,
+ 0x35c64d1e,
+ 0x4d08ec28,
+ 0x5e4ad722,
+ 0x6b8c9a3c,
+ 0x78cea136,
+ 0x9a11d850,
+ 0x8953e35a,
+ 0xbc95ae44,
+ 0xafd7954e,
+ 0xd7193478,
+ 0xc45b0f72,
+ 0xf19d426c,
+ 0xe2df7966,
+ },
+
+ {
+ 0x0,
+ 0xef52b6e1,
+ 0x5d46b83,
+ 0xea86dd62,
+ 0xba8d706,
+ 0xe4fa61e7,
+ 0xe7cbc85,
+ 0xe12e0a64,
+ 0x1751ae0c,
+ 0xf80318ed,
+ 0x1285c58f,
+ 0xfdd7736e,
+ 0x1cf9790a,
+ 0xf3abcfeb,
+ 0x192d1289,
+ 0xf67fa468,
+ },
+
+ {
+ 0x0,
+ 0x2ea35c18,
+ 0x5d46b830,
+ 0x73e5e428,
+ 0xba8d7060,
+ 0x942e2c78,
+ 0xe7cbc850,
+ 0xc9689448,
+ 0xae6be681,
+ 0x80c8ba99,
+ 0xf32d5eb1,
+ 0xdd8e02a9,
+ 0x14e696e1,
+ 0x3a45caf9,
+ 0x49a02ed1,
+ 0x670372c9,
+ },
+
+ {
+ 0x0,
+ 0x87a6cb43,
+ 0xd43c90c7,
+ 0x539a5b84,
+ 0x730827cf,
+ 0xf4aeec8c,
+ 0xa734b708,
+ 0x20927c4b,
+ 0xe6104f9e,
+ 0x61b684dd,
+ 0x322cdf59,
+ 0xb58a141a,
+ 0x95186851,
+ 0x12bea312,
+ 0x4124f896,
+ 0xc68233d5,
+ },
+
+ {
+ 0x0,
+ 0x1751997d,
+ 0x2ea332fa,
+ 0x39f2ab87,
+ 0x5d4665f4,
+ 0x4a17fc89,
+ 0x73e5570e,
+ 0x64b4ce73,
+ 0xba8ccbe8,
+ 0xaddd5295,
+ 0x942ff912,
+ 0x837e606f,
+ 0xe7caae1c,
+ 0xf09b3761,
+ 0xc9699ce6,
+ 0xde38059b,
+ },
+
+ {
+ 0x0,
+ 0xae689191,
+ 0x87a02563,
+ 0x29c8b4f2,
+ 0xd4314c87,
+ 0x7a59dd16,
+ 0x539169e4,
+ 0xfdf9f875,
+ 0x73139f4f,
+ 0xdd7b0ede,
+ 0xf4b3ba2c,
+ 0x5adb2bbd,
+ 0xa722d3c8,
+ 0x94a4259,
+ 0x2082f6ab,
+ 0x8eea673a,
+ },
+
+ {
+ 0x0,
+ 0xe6273e9e,
+ 0x173f7b7d,
+ 0xf11845e3,
+ 0x2e7ef6fa,
+ 0xc859c864,
+ 0x39418d87,
+ 0xdf66b319,
+ 0x5cfdedf4,
+ 0xbadad36a,
+ 0x4bc29689,
+ 0xade5a817,
+ 0x72831b0e,
+ 0x94a42590,
+ 0x65bc6073,
+ 0x839b5eed,
+ },
+
+ {
+ 0x0,
+ 0xb9fbdbe8,
+ 0xa886b191,
+ 0x117d6a79,
+ 0x8a7c6563,
+ 0x3387be8b,
+ 0x22fad4f2,
+ 0x9b010f1a,
+ 0xcf89cc87,
+ 0x7672176f,
+ 0x670f7d16,
+ 0xdef4a6fe,
+ 0x45f5a9e4,
+ 0xfc0e720c,
+ 0xed731875,
+ 0x5488c39d,
+ },
+
+ {
+ 0x0,
+ 0x44629f4f,
+ 0x88c53e9e,
+ 0xcca7a1d1,
+ 0xcafb7b7d,
+ 0x8e99e432,
+ 0x423e45e3,
+ 0x65cdaac,
+ 0x4e87f0bb,
+ 0xae56ff4,
+ 0xc642ce25,
+ 0x8220516a,
+ 0x847c8bc6,
+ 0xc01e1489,
+ 0xcb9b558,
+ 0x48db2a17,
+ },
+
+ {
+ 0x0,
+ 0x9d0fe176,
+ 0xe16ec4ad,
+ 0x7c6125db,
+ 0x19ac8f1b,
+ 0x84a36e6d,
+ 0xf8c24bb6,
+ 0x65cdaac0,
+ 0x33591e36,
+ 0xae56ff40,
+ 0xd237da9b,
+ 0x4f383bed,
+ 0x2af5912d,
+ 0xb7fa705b,
+ 0xcb9b5580,
+ 0x5694b4f6,
+ },
+
+ {
+ 0x0,
+ 0x66b23c6c,
+ 0xcd6478d8,
+ 0xabd644b4,
+ 0x41b9f7f1,
+ 0x270bcb9d,
+ 0x8cdd8f29,
+ 0xea6fb345,
+ 0x8373efe2,
+ 0xe5c1d38e,
+ 0x4e17973a,
+ 0x28a5ab56,
+ 0xc2ca1813,
+ 0xa478247f,
+ 0xfae60cb,
+ 0x691c5ca7,
+ },
+
+ {
+ 0x0,
+ 0xdd96d985,
+ 0x605cb54b,
+ 0xbdca6cce,
+ 0xc0b96a96,
+ 0x1d2fb313,
+ 0xa0e5dfdd,
+ 0x7d730658,
+ 0x5a03d36d,
+ 0x87950ae8,
+ 0x3a5f6626,
+ 0xe7c9bfa3,
+ 0x9abab9fb,
+ 0x472c607e,
+ 0xfae60cb0,
+ 0x2770d535,
+ },
+
+ {
+ 0x0,
+ 0xb407a6da,
+ 0xb37e4bf5,
+ 0x779ed2f,
+ 0xbd8d91ab,
+ 0x98a3771,
+ 0xef3da5e,
+ 0xbaf47c84,
+ 0xa06a2517,
+ 0x146d83cd,
+ 0x13146ee2,
+ 0xa713c838,
+ 0x1de7b4bc,
+ 0xa9e01266,
+ 0xae99ff49,
+ 0x1a9e5993,
+ },
+
+ {
+ 0x0,
+ 0x9ba54c6f,
+ 0xec3b9e9f,
+ 0x779ed2f0,
+ 0x3063b7f,
+ 0x98a37710,
+ 0xef3da5e0,
+ 0x7498e98f,
+ 0x60c76fe,
+ 0x9da93a91,
+ 0xea37e861,
+ 0x7192a40e,
+ 0x50a4d81,
+ 0x9eaf01ee,
+ 0xe931d31e,
+ 0x72949f71,
+ },
+
+ {
+ 0x0,
+ 0xc18edfc,
+ 0x1831dbf8,
+ 0x14293604,
+ 0x3063b7f0,
+ 0x3c7b5a0c,
+ 0x28526c08,
+ 0x244a81f4,
+ 0x60c76fe0,
+ 0x6cdf821c,
+ 0x78f6b418,
+ 0x74ee59e4,
+ 0x50a4d810,
+ 0x5cbc35ec,
+ 0x489503e8,
+ 0x448dee14,
+ },
+
+ {
+ 0x0,
+ 0xc18edfc0,
+ 0x586cb9c1,
+ 0x99e26601,
+ 0xb0d97382,
+ 0x7157ac42,
+ 0xe8b5ca43,
+ 0x293b1583,
+ 0xbac3e145,
+ 0x7b4d3e85,
+ 0xe2af5884,
+ 0x23218744,
+ 0xa1a92c7,
+ 0xcb944d07,
+ 0x52762b06,
+ 0x93f8f4c6,
+ },
+
+ {
+ 0x0,
+ 0xaef6c4cb,
+ 0x869c8fd7,
+ 0x286a4b1c,
+ 0xd64819ef,
+ 0x78bedd24,
+ 0x50d49638,
+ 0xfe2252f3,
+ 0x77e1359f,
+ 0xd917f154,
+ 0xf17dba48,
+ 0x5f8b7e83,
+ 0xa1a92c70,
+ 0xf5fe8bb,
+ 0x2735a3a7,
+ 0x89c3676c,
+ },
+
+ {
+ 0x0,
+ 0xefc26b3e,
+ 0x4f5d03d,
+ 0xeb37bb03,
+ 0x9eba07a,
+ 0xe629cb44,
+ 0xd1e7047,
+ 0xe2dc1b79,
+ 0x13d740f4,
+ 0xfc152bca,
+ 0x172290c9,
+ 0xf8e0fbf7,
+ 0x1a3ce08e,
+ 0xf5fe8bb0,
+ 0x1ec930b3,
+ 0xf10b5b8d,
+ },
+
+ {
+ 0x0,
+ 0x27ae81e8,
+ 0x4f5d03d0,
+ 0x68f38238,
+ 0x9eba07a0,
+ 0xb9148648,
+ 0xd1e70470,
+ 0xf6498598,
+ 0xe6050901,
+ 0xc1ab88e9,
+ 0xa9580ad1,
+ 0x8ef68b39,
+ 0x78bf0ea1,
+ 0x5f118f49,
+ 0x37e20d71,
+ 0x104c8c99,
+ },
+
+ {
+ 0x0,
+ 0x177b1443,
+ 0x2ef62886,
+ 0x398d3cc5,
+ 0x5dec510c,
+ 0x4a97454f,
+ 0x731a798a,
+ 0x64616dc9,
+ 0xbbd8a218,
+ 0xaca3b65b,
+ 0x952e8a9e,
+ 0x82559edd,
+ 0xe634f314,
+ 0xf14fe757,
+ 0xc8c2db92,
+ 0xdfb9cfd1,
+ },
+
+ {
+ 0x0,
+ 0xacc04271,
+ 0x82f182a3,
+ 0x2e31c0d2,
+ 0xde920307,
+ 0x72524176,
+ 0x5c6381a4,
+ 0xf0a3c3d5,
+ 0x6655004f,
+ 0xca95423e,
+ 0xe4a482ec,
+ 0x4864c09d,
+ 0xb8c70348,
+ 0x14074139,
+ 0x3a3681eb,
+ 0x96f6c39a,
+ },
+
+ {
+ 0x0,
+ 0xccaa009e,
+ 0x4225077d,
+ 0x8e8f07e3,
+ 0x844a0efa,
+ 0x48e00e64,
+ 0xc66f0987,
+ 0xac50919,
+ 0xd3e51bb5,
+ 0x1f4f1b2b,
+ 0x91c01cc8,
+ 0x5d6a1c56,
+ 0x57af154f,
+ 0x9b0515d1,
+ 0x158a1232,
+ 0xd92012ac,
+ },
+
+ {
+ 0x0,
+ 0x7cbb312b,
+ 0xf9766256,
+ 0x85cd537d,
+ 0x299dc2ed,
+ 0x5526f3c6,
+ 0xd0eba0bb,
+ 0xac509190,
+ 0x533b85da,
+ 0x2f80b4f1,
+ 0xaa4de78c,
+ 0xd6f6d6a7,
+ 0x7aa64737,
+ 0x61d761c,
+ 0x83d02561,
+ 0xff6b144a,
+ },
+
+ {
+ 0x0,
+ 0xa6770bb4,
+ 0x979f1129,
+ 0x31e81a9d,
+ 0xf44f2413,
+ 0x52382fa7,
+ 0x63d0353a,
+ 0xc5a73e8e,
+ 0x33ef4e67,
+ 0x959845d3,
+ 0xa4705f4e,
+ 0x20754fa,
+ 0xc7a06a74,
+ 0x61d761c0,
+ 0x503f7b5d,
+ 0xf64870e9,
+ },
+
+ {
+ 0x0,
+ 0x67de9cce,
+ 0xcfbd399c,
+ 0xa863a552,
+ 0x440b7579,
+ 0x23d5e9b7,
+ 0x8bb64ce5,
+ 0xec68d02b,
+ 0x8816eaf2,
+ 0xefc8763c,
+ 0x47abd36e,
+ 0x20754fa0,
+ 0xcc1d9f8b,
+ 0xabc30345,
+ 0x3a0a617,
+ 0x647e3ad9,
+ },
+
+ {
+ 0x0,
+ 0xcb5cd3a5,
+ 0x4dc8a10b,
+ 0x869472ae,
+ 0x9b914216,
+ 0x50cd91b3,
+ 0xd659e31d,
+ 0x1d0530b8,
+ 0xec53826d,
+ 0x270f51c8,
+ 0xa19b2366,
+ 0x6ac7f0c3,
+ 0x77c2c07b,
+ 0xbc9e13de,
+ 0x3a0a6170,
+ 0xf156b2d5,
+ },
+
+ {
+ 0x0,
+ 0x3d6029b,
+ 0x7ac0536,
+ 0x47a07ad,
+ 0xf580a6c,
+ 0xc8e08f7,
+ 0x8f40f5a,
+ 0xb220dc1,
+ 0x1eb014d8,
+ 0x1d661643,
+ 0x191c11ee,
+ 0x1aca1375,
+ 0x11e81eb4,
+ 0x123e1c2f,
+ 0x16441b82,
+ 0x15921919,
+ },
+
+ {
+ 0x0,
+ 0x3d6029b0,
+ 0x7ac05360,
+ 0x47a07ad0,
+ 0xf580a6c0,
+ 0xc8e08f70,
+ 0x8f40f5a0,
+ 0xb220dc10,
+ 0x30704bc1,
+ 0xd106271,
+ 0x4ab018a1,
+ 0x77d03111,
+ 0xc5f0ed01,
+ 0xf890c4b1,
+ 0xbf30be61,
+ 0x825097d1,
+ },
+
+ {
+ 0x0,
+ 0x60e09782,
+ 0xc1c12f04,
+ 0xa121b886,
+ 0x58f35849,
+ 0x3813cfcb,
+ 0x9932774d,
+ 0xf9d2e0cf,
+ 0xb1e6b092,
+ 0xd1062710,
+ 0x70279f96,
+ 0x10c70814,
+ 0xe915e8db,
+ 0x89f57f59,
+ 0x28d4c7df,
+ 0x4834505d,
+ },
+
+ {
+ 0x0,
+ 0xb8bc6765,
+ 0xaa09c88b,
+ 0x12b5afee,
+ 0x8f629757,
+ 0x37def032,
+ 0x256b5fdc,
+ 0x9dd738b9,
+ 0xc5b428ef,
+ 0x7d084f8a,
+ 0x6fbde064,
+ 0xd7018701,
+ 0x4ad6bfb8,
+ 0xf26ad8dd,
+ 0xe0df7733,
+ 0x58631056,
+ },
+
+ {
+ 0x0,
+ 0x5019579f,
+ 0xa032af3e,
+ 0xf02bf8a1,
+ 0x9b14583d,
+ 0xcb0d0fa2,
+ 0x3b26f703,
+ 0x6b3fa09c,
+ 0xed59b63b,
+ 0xbd40e1a4,
+ 0x4d6b1905,
+ 0x1d724e9a,
+ 0x764dee06,
+ 0x2654b999,
+ 0xd67f4138,
+ 0x866616a7,
+ },
+
+ {
+ 0x0,
+ 0x1c26a37,
+ 0x384d46e,
+ 0x246be59,
+ 0x709a8dc,
+ 0x6cbc2eb,
+ 0x48d7cb2,
+ 0x54f1685,
+ 0xe1351b8,
+ 0xfd13b8f,
+ 0xd9785d6,
+ 0xc55efe1,
+ 0x91af964,
+ 0x8d89353,
+ 0xa9e2d0a,
+ 0xb5c473d,
+ },
+
+ {
+ 0x0,
+ 0x1c26a370,
+ 0x384d46e0,
+ 0x246be590,
+ 0x709a8dc0,
+ 0x6cbc2eb0,
+ 0x48d7cb20,
+ 0x54f16850,
+ 0xe1351b80,
+ 0xfd13b8f0,
+ 0xd9785d60,
+ 0xc55efe10,
+ 0x91af9640,
+ 0x8d893530,
+ 0xa9e2d0a0,
+ 0xb5c473d0,
+ },
+
+ {
+ 0x0,
+ 0x191b3141,
+ 0x32366282,
+ 0x2b2d53c3,
+ 0x646cc504,
+ 0x7d77f445,
+ 0x565aa786,
+ 0x4f4196c7,
+ 0xc8d98a08,
+ 0xd1c2bb49,
+ 0xfaefe88a,
+ 0xe3f4d9cb,
+ 0xacb54f0c,
+ 0xb5ae7e4d,
+ 0x9e832d8e,
+ 0x87981ccf,
+ },
+
+ {
+ 0x0,
+ 0x4ac21251,
+ 0x958424a2,
+ 0xdf4636f3,
+ 0xf0794f05,
+ 0xbabb5d54,
+ 0x65fd6ba7,
+ 0x2f3f79f6,
+ 0x3b83984b,
+ 0x71418a1a,
+ 0xae07bce9,
+ 0xe4c5aeb8,
+ 0xcbfad74e,
+ 0x8138c51f,
+ 0x5e7ef3ec,
+ 0x14bce1bd,
+ },
+
+ {
+ 0x0,
+ 0x77073096,
+ 0xee0e612c,
+ 0x990951ba,
+ 0x76dc419,
+ 0x706af48f,
+ 0xe963a535,
+ 0x9e6495a3,
+ 0xedb8832,
+ 0x79dcb8a4,
+ 0xe0d5e91e,
+ 0x97d2d988,
+ 0x9b64c2b,
+ 0x7eb17cbd,
+ 0xe7b82d07,
+ 0x90bf1d91,
+ },
+
+ {
+ 0x0,
+ 0x1db71064,
+ 0x3b6e20c8,
+ 0x26d930ac,
+ 0x76dc4190,
+ 0x6b6b51f4,
+ 0x4db26158,
+ 0x5005713c,
+ 0xedb88320,
+ 0xf00f9344,
+ 0xd6d6a3e8,
+ 0xcb61b38c,
+ 0x9b64c2b0,
+ 0x86d3d2d4,
+ 0xa00ae278,
+ 0xbdbdf21c,
+ },
+ };
+
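+ // table64[nib][x] is the CRC-32 contribution of the 4-bit value x at nibble
+ // position nib of a 32-byte section. Because CRC-32 is linear over GF(2),
+ // the update for a whole section is the XOR of the 64 independent lookups,
+ // so all of them can be computed in parallel. The running CRC is folded
+ // into the first eight nibbles (32 bits) only.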
+ const int num_nibbles_parallel = 64;
+
+ const int num_sections = accessor_isz / (num_nibbles_parallel /
+ 2); // how many loop iterations
+ unsigned int result = ~0;
+
+ for (int i = 0; i < num_sections; i++) {
+ unsigned int result_update_odd = 0;
+ unsigned int result_update_even = 0;
+// Loop over the 4-bit chunks (nibbles) within this section. This loop can be
+// unrolled; the total CRC update is the XOR of the per-nibble updates.
+ #pragma unroll
+ for (int nib = 0; nib < num_nibbles_parallel; nib++) {
+ unsigned char this_input_nibble =
+ (acc_pibuf[(i * num_nibbles_parallel + nib) / 2] >>
+ (4 * (nib % 2)));
+ unsigned char this_result_nibble =
+ (nib < 8) ? (result >> (4 * nib)) : 0;
+ unsigned char this_table_index =
+ this_input_nibble ^ this_result_nibble;
+ if (nib % 2) {
+ result_update_odd ^= table64[nib][this_table_index & 0xf];
+ } else {
+ result_update_even ^= table64[nib][this_table_index & 0xf];
+ }
+ }
+ result = result_update_odd ^ result_update_even;
+ }
+
+ accresult_crc[0] = ~result;
+ });
+ });
+
+ e_lz = q.submit([&](handler &h) {
+ auto accessor_isz = block_size;
+ auto acc_pibuf = pibuf->get_access(h);
+
+ h.single_task>([=]() [[intel::kernel_args_restrict]] {
+ //-------------------------------------
+ // Hash Table(s)
+ //-------------------------------------
+
+ [[intelfpga::singlepump]] [[intelfpga::numbanks(kVec)]] [
+ [intelfpga::max_replicates(kVec)]] struct {
+ unsigned char s[kLen];
+ } dictionary[kDepth][kVec];
+
+ [[intelfpga::singlepump]] [[intelfpga::numbanks(kVec)]] [
+ [intelfpga::max_replicates(
+ kVec)]] unsigned int dict_offset[kDepth][kVec];
+
+ // Initialize history to empty.
+ for (int i = 0; i < kDepth; i++) {
+ Unroller<0, kVec>::step([&](int k) { dict_offset[i][k] = 0; });
+ }
+
+ // This is the window of data on which we look for matches
+ // We fetch twice our data size because we have kVec offsets
+ unsigned char current_window[kVecX2];
+
+ // These are the compare windows fetched from the dictionaries: one
+ // kLen-byte frame per dictionary, for each current-window offset
+ unsigned char compare_window[kLen][kVec][kVec];
+ // kVec bytes per dict----------| | |
+ // kVec dictionaries-----------------| |
+ // one for each curr win offset---------|
+
+ // load offset into these arrays
+ unsigned int compare_offset[kVec][kVec];
+ // one per kVec bytes----------| |
+ // one for each compwin-------------|
+
+ // Initialize input stream position
+ unsigned int inpos_minus_vec_div_16 = 0;
+
+ // this is ceiling of (insize-kVec)/16, original comparison was
+ // inpos < insize, now inpos is carried as (inpos-kVec)/16 so this is what
+ // we compare to
+ unsigned int insize_compare = (accessor_isz) / kVec;
+
+ int ctr = insize_compare = insize_compare - 1;
+
+ char first_valid_pos = 0;
+
+ struct DistLen dist_offs_data;
+
+ int distchan_ndx = 0;
+ size_t inpos = 0;
+
+ // load in new data
+ struct LzInput in;
+ Unroller<0, kVec>::step([&](int i) { in.data[i] = acc_pibuf[inpos++]; });
+
+ Unroller<0, kVec>::step(
+ [&](int i) { current_window[i + kVec] = in.data[i]; });
+
+ do {
+ //-----------------------------
+ // Prepare current window
+ //-----------------------------
+
+ // shift current window
+ Unroller<0, kVec>::step(
+ [&](int i) { current_window[i] = current_window[i + kVec]; });
+
+ // load in new data
+ Unroller<0, kVec>::step(
+ [&](int i) { in.data[i] = acc_pibuf[inpos++]; });
+
+ Unroller<0, kVec>::step(
+ [&](int i) { current_window[kVec + i] = in.data[i]; });
+
+ //-----------------------------
+ // Compute hash
+ //-----------------------------
+
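+ // Hash the 4 bytes starting at each of the kVec candidate positions in the
+ // current window; kHashMask keeps the index within the kDepth-entry
+ // dictionaries.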
+ unsigned short hash[kVec];
+
+ Unroller<0, kVec>::step([&](int i) {
+ hash[i] = (current_window[i] ^ (current_window[i + 1] << 6) ^
+ (current_window[i + 2] << 2) ^ current_window[i + 3]) &
+ kHashMask;
+ });
+
+ //-----------------------------
+ // Dictionary look-up
+ //-----------------------------
+
+ // loop over kVec compare windows, each has a different hash
+ Unroller<0, kVec>::step([&](int i) {
+ // loop over all kVec bytes
+ Unroller<0, kLen>::step([&](int j) {
+ Unroller<0, kVec>::step([&](int k) {
+ compare_window[k][j][i] = dictionary[hash[i]][j].s[k];
+ });
+ });
+ });
+
+ // loop over compare windows
+ Unroller<0, kVec>::step([&](int i) {
+ Unroller<0, kLen>::step([&](int j) {
+ // loop over frames in this compare window
+ // (they come from different dictionaries)
+ compare_offset[j][i] = dict_offset[hash[i]][j];
+ });
+ });
+
+ //-----------------------------
+ // Dictionary update
+ //-----------------------------
+
+ // loop over different dictionaries to store different frames
+ // store one frame per dictionary
+ // loop over kVec bytes to store
+ Unroller<0, kLen>::step([&](int i) {
+ Unroller<0, kVec>::step([&](int j) {
+ // store actual bytes
+ dictionary[hash[i]][i].s[j] = current_window[i + j];
+ });
+ });
+
+ Unroller<0, kVec>::step([&](int i) {
+ // loop over kVec different dictionaries and write one word to each
+ dict_offset[hash[i]][i] =
+ (inpos_minus_vec_div_16 << 4) |
+ i; // inpos - kVec + 0, we know that inpos - kVec has 0 as the 4
+ // lower bits so really just concatenate
+ });
+
+ //-----------------------------
+ // Match search
+ //-----------------------------
+
+ // arrays to store length, best length etc..
+ unsigned char length[kVec];
+ bool done[kVec];
+ char best_length[kVec];
+ unsigned int best_offset[kVec];
+
+ // initialize best_length
+ Unroller<0, kVec>::step([&](int i) {
+ best_length[i] = 0;
+ best_offset[i] = 0;
+ });
+
+ // loop over each comparison window frame
+ // one comes from each dictionary
+ Unroller<0, kVec>::step([&](int i) {
+ // initialize length and done
+ Unroller<0, kVec>::step([&](int l) {
+ length[l] = 0;
+ done[l] = 0;
+ });
+
+ // loop over each current window
+ Unroller<0, kVec>::step([&](int j) {
+ // loop over each char in the current window
+ // and corresponding char in comparison window
+ Unroller<0, kLen>::step([&](int k) {
+ bool comp =
+ current_window[k + j] == compare_window[k][i][j] && !done[j];
+ length[j] += comp;
+ done[j] = !comp;
+ });
+ });
+
+ // Check if this the best length
+ Unroller<0, kVec>::step([&](int m) {
+ bool update_best =
+ (length[m] > best_length[m]) && (compare_offset[i][m] != 0) &&
+ (((inpos_minus_vec_div_16 << kVecPow) | (i & (kVec - 1))) -
+ (compare_offset[i][m]) <
+ kMaxDistance);
+
+ unsigned int new_offset =
+ (((inpos_minus_vec_div_16 << kVecPow) | (m & (kVec - 1))) &
+ 0x7ffff) -
+ ((compare_offset[i][m] & 0x7ffff));
+
+ // Reconsider if new_offset is bigger than current offset, might
+ // take more bytes to encode
+ update_best = update_best && (length[m] == best_length[m]) &&
+ (new_offset > best_offset[m])
+ ? false
+ : update_best;
+
+ best_offset[m] = (update_best ? new_offset : best_offset[m]) &
+ 0x7ffff; // 19 bits is sufficient
+
+ best_length[m] = (update_best ? length[m] : best_length[m]) &
+ 0x1f; // 5 bits is sufficient
+ });
+ });
+
+ //-----------------------------
+ // Filter matches step 1
+ //-----------------------------
+
+ // Remove matches with offsets <= 0 (self-matches or no match at all), and
+ // keep only the matches that, when encoded, take fewer bytes than the
+ // literal bytes they replace
+ Unroller<0, kVec>::step([&](int i) {
+ best_length[i] = (((best_length[i] & 0x1f) >= 3) &&
+ ((best_offset[i]) < kMaxDistance)
+ ? best_length[i]
+ : 0) &
+ 0x1f; // 5 bits is sufficient
+
+ // Second level filter - remove matches with len 3, greater than
+ // kTooFar
+ best_length[i] =
+ (((best_length[i] & 0x1f) == 3) && ((best_offset[i]) > kTooFar)
+ ? 0
+ : best_length[i]) &
+ 0x1f; // 5 bits is sufficient
+ // don't emit matches for the last iteration as some of the
+ // second part of the window might be undefined
+ if (ctr == 0) best_length[i] = 0;
+ });
+
+ //-----------------------------
+ // Assign first_valid_pos
+ //-----------------------------
+
+ // first_valid_pos is loop-carried, and tricky to compute. So first
+ // compute it speculatively in parallel for every possible value of the
+ // previous first_valid_pos.
+ char first_valid_pos_speculative[kVec];
+
+ Unroller<0, kVec>::step([&](int guess) {
+ unsigned char next_match_search = guess;
+ Unroller<0, kVec>::step([&](int i) {
+ unsigned int len = best_length[i];
+
+ // Skip to the next match
+ next_match_search =
+ i >= next_match_search && len > 0 ? i + len : next_match_search;
+ });
+
+ first_valid_pos_speculative[guess] =
+ next_match_search - kVec > 0 ? next_match_search - kVec : 0;
+ });
+
+ // For kVec=16 (the largest currently supported), this should be a 16:1
+ // mux, which is 2 6LUTs deep. For larger kVec, it will be worse.
+ unsigned char current_valid_pos = first_valid_pos;
+ first_valid_pos =
+ first_valid_pos_speculative[first_valid_pos & (kVec - 1)] &
+ (kVec -
+ 1); // first_valid_pos only needs 4 bits, make this explicit
+
+ // greedy match selection
+ Unroller<0, (kVec)>::step([&](int i) {
+ unsigned int len = best_length[i];
+ best_length[i] = i < current_valid_pos ? -1 : best_length[i];
+ // Skip to the next match
+ current_valid_pos =
+ i >= current_valid_pos && len > 0 ? i + len : current_valid_pos;
+ });
+
+ //-----------------------------
+ // Setup LZ dist/len pairs to push to Huffman encode kernel
+ //-----------------------------
+
+ Unroller<0, kVec>::step([&](int i) {
+ dist_offs_data.data[i] = 0;
+ dist_offs_data.len[i] = -1;
+ dist_offs_data.dist[i] = -1;
+ if (best_length[i] >= 0) {
+ dist_offs_data.data[i] = current_window[i];
+ dist_offs_data.len[i] = best_length[i];
+ dist_offs_data.dist[i] = best_offset[i];
+ }
+ });
+
+ acc_dist_channel::write(dist_offs_data);
+
+ // increment input position
+ inpos_minus_vec_div_16++;
+ distchan_ndx += 1;
+ ctr--;
+
+ } while (ctr >= 0);
+
+ const char lasti = accessor_isz - (accessor_isz & ~(kVec - 1));
+ const char firstpos = first_valid_pos;
+ Unroller<0, kVec>::step([&](unsigned char i) {
+ dist_offs_data.data[i] = 0;
+ dist_offs_data.len[i] = -1;
+ dist_offs_data.dist[i] = -1;
+ });
+
+ Unroller<0, kVec>::step([&](unsigned char i) {
+ bool pred =
+ ((i - firstpos) < (lasti - firstpos)) && ((i - firstpos) >= 0);
+ dist_offs_data.data[i] = pred ? current_window[i + kVec] : 0;
+ dist_offs_data.len[i] = pred ? 0 : -1;
+ });
+
+ acc_dist_channel_last::write(dist_offs_data);
+ });
+ });
+
+ e_huff = q.submit([&](handler &h) {
+ auto accessor_isz = block_size;
+ auto acc_gzip_out =
+ gzip_out_buf->get_access(h);
+ auto accessor_output = pobuf->get_access(h);
+ auto acc_eof = last_block ? 1 : 0;
+ h.single_task>([=
+ ]() [[intel::kernel_args_restrict]] {
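+ // 'leftover' is a bit buffer carried across loop iterations: encoded bits
+ // that have not yet been flushed as a complete kVec-word block of output.
+ // 'leftover_size' tracks how many bits it currently holds.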
+ unsigned int leftover[kVec] = {0};
+ Unroller<0, kVec>::step([&](int i) { leftover[i] = 0; });
+
+ unsigned short leftover_size = 0;
+
+ unsigned int outpos_huffman = 0;
+
+ int ctr = ((accessor_isz) / kVec) + 2;
+ int odx = 0;
+
+ // Add the gzip start block marker. Assumes static huffman trees.
+ leftover_size = 3;
+ leftover[0] = ((kStaticTrees << 1) + (acc_eof));
+
+ do {
+ struct DistLen in;
+ // init the input structure for the gzip end block marker.
+ // this is the very last data block to be encoded and written.
+ Unroller<0, kVec>::step([&](int i) {
+ in.len[i] = -1;
+ in.dist[i] = -1;
+ in.data[i] = 0;
+ });
+ in.len[0] = ctr == 1 ? -3 : -1;
+ in.data[0] = 0;
+
+ in = ctr > 2 ? acc_dist_channel::read()
+ : (ctr == 2 ? acc_dist_channel_last::read() : in);
+
+ struct HuffmanOutput outdata;
+ outdata.write = HufEnc(in.len, in.dist, in.data, outdata.data, leftover,
+ &leftover_size);
+
+ // prevent out of bounds write
+ if (((ctr == 0) || outdata.write) && (odx < accessor_isz)) {
+ Unroller<0, kVec * sizeof(unsigned int)>::step([&](int i) {
+ accessor_output[odx + i] =
+ (ctr == 0) ? (unsigned char)(leftover[(i >> 2) & 0xf] >>
+ ((i & 3) << 3))
+ : (unsigned char)(outdata.data[(i >> 2) & 0xf] >>
+ ((i & 3) << 3));
+ });
+ }
+
+ outpos_huffman = outdata.write ? outpos_huffman + 1 : outpos_huffman;
+ odx += outdata.write ? (sizeof(unsigned int) << kVecPow) : 0;
+
+ } while (ctr--);
+
+ // Store summary values from lz and huffman
+ acc_gzip_out[0].compression_sz =
+ (outpos_huffman * sizeof(unsigned int) * kVec) +
+ (leftover_size + 7) / 8;
+ });
+ });
+}
+
+void SubmitGzipTasks(queue &q,
+ size_t block_size, // size of block to compress.
+ buffer *pibuf, buffer *pobuf,
+ buffer *gzip_out_buf,
+ buffer *result_crc, bool last_block,
+ event &e_crc, event &e_lz, event &e_huff,
+ size_t engineID) {
+ // Statically declare the engines so that the hardware is created for them.
+ // But at run time, the host can dynamically select which engine(s) to use via
+ // engineID.
+ if (engineID == 0) {
+ SubmitGzipTasksSingleEngine<0>(q, block_size, pibuf, pobuf, gzip_out_buf,
+ result_crc, last_block, e_crc, e_lz, e_huff);
+ }
+
+ #if NUM_ENGINES > 1
+ if (engineID == 1) {
+ SubmitGzipTasksSingleEngine<1>(q, block_size, pibuf, pobuf, gzip_out_buf,
+ result_crc, last_block, e_crc, e_lz, e_huff);
+ }
+ #endif
+
+ // If this reference design is to be expanded to > 2 engines, declare them here.
+
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp
new file mode 100755
index 0000000000..7de9a3ea17
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/gzipkernel.hpp
@@ -0,0 +1,45 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __GZIPKERNEL_H__
+#define __GZIPKERNEL_H__
+#pragma once
+
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+extern "C" void SubmitGzipTasks(
+ queue &sycl_device,
+ size_t block_size, // size of block to compress.
+ buffer *pibuf, buffer *pobuf,
+ buffer *gzip_out_buf,
+ buffer *current_crc, bool last_block, event &e_crc,
+ event &e_lz, event &e_huff, size_t engineID);
+
+#endif //__GZIPKERNEL_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp
new file mode 100755
index 0000000000..65f207bab7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/gzip/src/kernels.hpp
@@ -0,0 +1,148 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#ifndef __KERNELS_H__
+#define __KERNELS_H__
+#pragma once
+
+#ifndef NUM_ENGINES
+ #define NUM_ENGINES 1
+#endif
+
+constexpr int kNumEngines = NUM_ENGINES;
+
+// kVecPow == 2 means kVec == 4.
+// kVecPow == 3 means kVec == 8.
+// kVecPow == 4 means kVec == 16.
+constexpr int kVecPow = 4;
+
+constexpr int kVec = 1 << kVecPow;
+constexpr int kVecX2 = 2 * kVec;
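+// With the default kVecPow of 4, each engine consumes kVec == 16 input bytes
+// per pipeline step and the sliding current window spans kVecX2 == 32 bytes.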
+
+constexpr int kHufTableSize = 256;
+
+// Maximum length of huffman codes
+constexpr int kMaxHuffcodeBits = 16;
+
+struct Uint2Gzip {
+ unsigned int y;
+ unsigned int x;
+};
+
+struct LzInput {
+ unsigned char data[kVec];
+};
+
+typedef struct DistLen {
+ unsigned char data[kVec];
+ char len[kVec];
+ short dist[kVec];
+} DistLen, *pdist_len_t;
+
+struct HuffmanOutput {
+ unsigned int data[kVec];
+ bool write;
+};
+
+struct TrailingOutput {
+ int bytecount_left;
+ int bytecount;
+ unsigned char bytes[kVec * sizeof(unsigned int)];
+};
+
+struct GzipOutInfo {
+ // final compressed block size
+ size_t compression_sz;
+ unsigned long crc;
+};
+
+// kLen must be == kVec
+constexpr int kLen = kVec;
+
+// depth of the dictionary buffers
+constexpr int kDepth = 512;
+
+// Assumes kDepth is a power of 2.
+constexpr int kHashMask = kDepth - 1;
+
+#define CONSTANT __constant
+
+constexpr int kDebug = 1;
+#define TRACE(x) \
+ do { \
+ if (kDebug) printf x; \
+ } while (0)
+
+constexpr int kStaticTrees = 1;
+
+typedef struct CtData {
+ unsigned short code;
+ unsigned short len;
+} CtData;
+
+constexpr int kMaxMatch = 258;
+constexpr int kMinMatch = 3;
+
+constexpr int kTooFar = 4096;
+
+// All codes must not exceed kMaxBits
+constexpr int kMaxBits = 15;
+
+// number of length codes, not counting the special kEndBlock code
+constexpr int kLengthCodes = 29;
+
+// number of literal bytes, 0..255
+constexpr int kLiterals = 256;
+
+// end of literal code block
+constexpr int kEndBlock = 256;
+
+// number of literal or length codes, including kEndBlock
+constexpr int kLCodes = (kLiterals + 1 + kLengthCodes);
+
+// number of distance codes
+constexpr int kDCodes = 30;
+
+// number of codes used to transfer the bit lengths
+constexpr int kBLCodes = 19;
+
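+// 32 KB is the maximum back-reference distance allowed by the DEFLATE format.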
+constexpr int kMaxDistance = ((32 * 1024));
+
+constexpr int kMinBufferSize = 16384;
+
+struct DictString {
+ unsigned char s[kLen];
+};
+
+// Mapping from a distance to a distance code. dist is the distance - 1 and
+// must not have side effects. dist_code[256] and dist_code[257] are never
+// used.
+#define d_code(dist) \
+ ((dist) < 256 ? dist_code[dist] : dist_code[256 + ((dist) >> 7)])
+
+#endif //__KERNELS_H__
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt
new file mode 100755
index 0000000000..81cd1c747a
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+
+cmake_minimum_required (VERSION 2.8)
+
+project(QRD)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md
new file mode 100755
index 0000000000..34288260cf
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/README.md
@@ -0,0 +1,239 @@
+# QR Decomposition of Matrices
+This DPC++ reference design demonstrates high-performance QR decomposition of complex matrices on FPGA.
+
+***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® PAC with Intel Stratix® 10 SX FPGA;
Intel Xeon® CPU E5-1650 v2 @ 3.50GHz (host machine)
+| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | Implementing a high performance FPGA version of the Gram-Schmidt QR decomposition algorithm.
+| Time to complete | 1 hr (not including compile time)
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+
+**Performance**
+Please refer to performance disclaimer at the end of this README.
+
+| Device | Throughput
+|:--- |:---
+| Intel® PAC with Intel Arria® 10 GX FPGA | 25k matrices/s for matrices of size 128 * 128
+| Intel® PAC with Intel Stratix® 10 SX FPGA | 7k matrices/s for matrices of size 256 * 256
+
+
+## Purpose
+
+This FPGA reference design demonstrates QR decomposition of matrices of complex numbers, a common operation employed in linear algebra. Matrix _A_ (input) is decomposed into a product of an orthogonal matrix _Q_ and an upper triangular matrix _R_.
+
+The algorithms employed by the reference design are the Gram-Schmidt QR decomposition algorithm and the thin QR factorization method. Background information on these algorithms can be found in Wikipedia's [QR decomposition](https://en.wikipedia.org/wiki/QR_decomposition) article. The original algorithm has been modified and optimized for performance on FPGAs in this implementation.
+
+QR decomposition is used extensively in signal processing applications such as beamforming, multiple-input multiple-output (MIMO) processing, and Space Time Adaptive Processing (STAP).
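+
+For background, the following is a minimal host-side sketch of the modified Gram-Schmidt procedure for a real _m_ × _n_ matrix stored column by column. It is purely illustrative (the function and variable names below are not part of the design); the FPGA kernel in `qrd.cpp` uses a refactored, complex-valued formulation of the same idea.
+
+```
+#include <cmath>
+#include <vector>
+
+// Thin QR by modified Gram-Schmidt: A (m x n, column-major) = Q (m x n) * R (n x n).
+void GramSchmidtQR(const std::vector<float> &A, int m, int n,
+                   std::vector<float> &Q, std::vector<float> &R) {
+  Q = A;                  // start from the columns of A
+  R.assign(n * n, 0.0f);
+  for (int j = 0; j < n; j++) {
+    // Remove from column j its projection onto each already-orthonormal column.
+    for (int i = 0; i < j; i++) {
+      float dot = 0.0f;
+      for (int k = 0; k < m; k++) dot += Q[i * m + k] * Q[j * m + k];
+      R[j * n + i] = dot;  // R(i, j), stored column-major
+      for (int k = 0; k < m; k++) Q[j * m + k] -= dot * Q[i * m + k];
+    }
+    // Normalize column j; its norm becomes the diagonal entry R(j, j).
+    float norm = 0.0f;
+    for (int k = 0; k < m; k++) norm += Q[j * m + k] * Q[j * m + k];
+    norm = std::sqrt(norm);
+    R[j * n + j] = norm;
+    for (int k = 0; k < m; k++) Q[j * m + k] /= norm;
+  }
+}
+```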
+
+
+### Matrix dimensions and FPGA resources
+
+The QR decomposition algorithm factors a complex _m_×_n_ matrix, where _m_ ≥ _n_. The algorithm computes the vector dot product of two columns of the matrix. In our FPGA implementation, the dot product is computed in a loop over the _m_ elements of the column. The loop is fully unrolled to maximize throughput. As a result, *m* complex multiplication operations are performed in parallel on the FPGA, followed by sequential additions to compute the dot product result.
+
+We use the compiler flag `-Xsfp-relaxed`, which permits the compiler to reorder floating point additions (i.e. to assume that floating point addition is associative). The compiler uses this freedom to reorder the additions so that the dot product arithmetic can be optimally implemented using the FPGA's specialized floating point DSP (Digital Signal Processing) hardware.
+
+With this optimization, our FPGA implementation requires 4*m* DSPs to compute the complex floating point dot product. Thus, the matrix size is constrained by the total FPGA DSP resources available. Note that this upper bound is a consequence of this particular implementation.
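+
+As an illustration of the arithmetic, the scalar sketch below (hypothetical names, not kernel code) accumulates a complex dot product element by element. Each element contributes four real multiplies, which is where the 4*m* DSP estimate comes from once the loop over _m_ is fully unrolled.
+
+```
+struct Complex {
+  float re, im;
+};
+
+// Dot product <a, b> = sum of a[k] * conj(b[k]) over m elements.
+// When the loop over k is fully unrolled, the 4 real multiplies per element
+// become 4*m parallel multipliers, followed by an adder tree for the sums.
+Complex ComplexDot(const Complex *a, const Complex *b, int m) {
+  Complex acc = {0.0f, 0.0f};
+  for (int k = 0; k < m; k++) {
+    acc.re += a[k].re * b[k].re + a[k].im * b[k].im;  // 2 real multiplies
+    acc.im += a[k].im * b[k].re - a[k].re * b[k].im;  // 2 real multiplies
+  }
+  return acc;
+}
+```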
+
+By default, the design is parameterized to process 128 × 128 matrices when compiled targeting Intel® PAC with Intel Arria® 10 GX FPGA. It is parameterized to process 256 × 256 matrices when compiled targeting Intel® PAC with Intel Stratix® 10 SX FPGA, a larger device.
+
+
+## Key Implementation Details
+| Kernel | Description
+--- |---
+| QRD | Implements a modified Gram-Schmidt QR decomposition algorithm.
+
+To optimize the performance-critical loop in its algorithm, the design leverages concepts discussed in the following FPGA tutorials:
+* **Triangular Loop Optimization** (triangular_loop)
+* **Explicit Pipelining with `fpga_reg`** (fpga_register)
+* **Loop `ivdep` Attribute** (loop_ivdep)
+* **Unrolling Loops** (loop_unroll)
+
+ The key optimization techniques used are as follows:
+ 1. Refactoring the algorithm to merge two dot products into one, reducing the total number of dot products needed from three to two. This helps us reduce the number of DSPs needed for the implementation.
+ 2. Converting the nested loop into a single merged loop and applying Triangular Loop optimizations (a simplified sketch of this transformation follows this list). This allows us to generate a design that is very well pipelined.
+ 3. Fully vectorizing the dot products using loop unrolling.
+ 4. Using the compiler flag `-Xsfp-relaxed` to re-order floating point operations, allowing the inference of a specialized dot-product DSP. This further reduces the number of DSP blocks needed by the implementation, the overall latency, and the pipeline depth.
+ 5. Using an efficient memory banking scheme to generate high performance hardware.
+ 6. Using the `fpga_reg` attribute to insert more pipeline stages where needed to improve the frequency achieved by the design.
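+
+A simplified sketch of the loop merging described in item 2 is shown below (hypothetical names; the actual loop in `qrd.cpp` additionally pads short rows with extra iterations to satisfy the `ivdep` safelen set by `FIXED_ITERATIONS`):
+
+```
+// Original nested triangular loop:
+//   for (int i = 0; i < n; i++)
+//     for (int j = i; j < n; j++)
+//       Work(i, j);
+
+void Work(int i, int j) { /* body of the original nested loop */ }
+
+// Merged form: a single loop visits the same (i, j) pairs, so only one loop
+// has to be pipelined.
+void MergedTriangular(int n) {
+  int i = 0, j = 0;
+  int total_iterations = n * (n + 1) / 2;  // number of (i, j) pairs with j >= i
+  for (int s = 0; s < total_iterations; s++) {
+    Work(i, j);
+    if (j == n - 1) {  // end of row i: advance to the next row
+      i++;
+      j = i;
+    } else {
+      j++;
+    }
+  }
+}
+```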
+
+## License
+This code sample is licensed under MIT license.
+
+## Building the Reference Design
+
+### Include Files
+The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Code Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 24h.
+
+### On a Linux* System
+1. Generate the `Makefile` by running `cmake` from a `build` directory inside the design directory:
+
+ ```
+ mkdir build
+ cd build
+ ```
+
+ If you are compiling for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+
+ ```
+ cmake ..
+ ```
+
+ If instead you are compiling for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following targets are provided and they match the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device).
+
+ ```
+ make fpga_emu
+ ```
+
+ * Generate HTML performance report. Find the report at `qrd_report.prj/reports/report.html`.
+
+ ```
+ make report
+ ```
+
+ * Compile for FPGA hardware (longer compile time, targets FPGA device).
+
+ ```
+ make fpga
+ ```
+
+3. (Optional) As the above hardware compile may take several hours to complete, a precompiled binary for the Intel® PAC with Intel Arria® 10 GX FPGA can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+Note: Ensure that Microsoft Visual Studio* (2017, or 2019 Version 16.4 or newer) with "Desktop development with C++" workload is installed on your system.
+
+1. Enter source file directory.
+
+```
+cd src
+```
+
+2. Compile the design. The following targets are provided and they match the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device).
+
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate HTML performance report. Find the report at `../src/qrd_report.prj/reports/report.html`.
+
+ ```
+ ninja report
+ ```
+
+ If you are targeting the Intel® PAC with Intel Stratix® 10 SX FPGA, please use the following target and find the report in `../src/qrd_s10_pac_report.prj/reports/report.html`.
+
+ ```
+ ninja report_s10_pac
+ ```
+
+ * **Not supported yet:** Compile and run on FPGA hardware.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this Reference Design in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Running the Reference Design
+You can apply QR decomposition to a number of matrices as shown below. This step performs the following:
+* Generates the number of random matrices specified as the command line argument (defaults to 1).
+* Computes QR decomposition on all matrices.
+* Evaluates performance.
+NOTE: The design is optimized to perform best when run on a large number of matrices, where the total number of matrices is a power of 2.
+
+
+
+1. Run the sample on the FPGA emulator (the kernel executes on the CPU).
+ ```
+ ./qrd.fpga_emu (Linux)
+ qrd.fpga_emu.exe (Windows)
+ ```
+
+2. Run the sample on the FPGA device. It is recommended to pass in an optional argument (as shown) when invoking the sample on hardware. Otherwise, the performance will not be representative.
+ ```
+ ./qrd.fpga 40960 (Linux)
+ ```
+### Application Parameters
+
+| Argument | Description
+--- |---
+| `<num>` | Optional argument that specifies the number of matrices to decompose. Its default value is `1`.
+
+### Example of Output
+
+Example output when running on Intel® PAC with Intel Arria® 10 GX FPGA for 32768 matrices (each consisting of 128 * 128 complex numbers):
+
+```
+Device name: pac_a10 : Intel PAC Platform (pac_f000000)
+Generating 32768 random matrices
+Running QR decomposition of 32768 matrices repeatedly
+ Total duration: 41.3763 s
+Throughput: 25.3425k matrices/s
+Verifying results on matrix 0 16384 32767
+PASSED
+```
+
+Example output when running on Intel® PAC with Intel Stratix® 10 SX FPGA for 4096 matrices (each consisting of 256 * 256 complex numbers):
+
+```
+Device name: pac_s10 : Intel PAC Platform (pac_f100000)
+Generating 4096 random matrices
+Running QR decomposition of 4096 matrices repeatedly
+ Total duration: 17.3197 s
+Throughput: 7.5678k matrices/s
+Verifying results on matrix 0 2048 4095
+PASSED
+```
+
+## Additional Design Information
+
+### Compiler Flags Used
+
+| Flag | Description
+--- |---
+`-Xshardware` | Target FPGA hardware (as opposed to FPGA emulator)
+`-Xsclock=330MHz` | The FPGA backend attempts to achieve 330 MHz
+`-Xsfp-relaxed` | Allows the FPGA backend to re-order floating point arithmetic operations (e.g. permit assuming (a + b + c) == (c + a + b) )
+`-Xsparallel=2` | Use 2 cores when compiling the bitstream through Quartus
+`-Xsseed` | Specifies the Quartus compile seed, to yield slightly higher fmax
+`-DROWS_COMPONENT` | Specifies the number of rows of the matrix
+`-DCOLS_COMPONENT` | Specifies the number of columns of the matrix
+`-DFIXED_ITERATIONS` | Used to set the ivdep safelen attribute for the performance critical triangular loop
+
+NOTE: The values for `seed`, `FIXED_ITERATIONS`, `ROWS_COMPONENT`, `COLS_COMPONENT` are set according to the board being targeted.
+
+### Performance disclaimers
+
+Tests document performance of components on a particular test, in specific systems. Differences in hardware, software, or configuration will affect actual performance. Consult other sources of information to evaluate performance as you consider your purchase. For more complete information about performance and benchmark results, visit [www.intel.com/benchmarks](www.intel.com/benchmarks).
+
+Performance results are based on testing as of July 29, 2020 and may not reflect all publicly available security updates. See configuration disclosure for details. No product or component can be absolutely secure.
+
+Intel technologies’ features and benefits depend on system configuration and may require enabled hardware, software or service activation. Performance varies depending on system configuration. Check with your system manufacturer or retailer or learn more at [intel.com](www.intel.com).
+
+The performance was measured by Intel on July 29, 2020.
+
+Intel and the Intel logo are trademarks of Intel Corporation or its subsidiaries in the U.S. and/or other countries.
+
+(C) Intel Corporation.
+
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln
new file mode 100755
index 0000000000..b5e086d1f5
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "qrd", "qrd.vcxproj", "{ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.ActiveCfg = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Debug|x64.Build.0 = Debug|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.ActiveCfg = Release|x64
+ {ACDE6B7A-6F9A-428E-B040-CEDC5B1E2C79}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {97D1BD74-AAAB-4835-8F00-37A58B70871A}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj
new file mode 100755
index 0000000000..95a7067c03
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/qrd.vcxproj
@@ -0,0 +1,170 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 15.0
+ {acde6b7a-6f9a-428e-b040-cedc5b1e2c79}
+ Win32Proj
+ qrd
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ -Xsclock=330MHz;-Xsfp-relaxed;-Xsparallel=2
+
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 %(AdditionalOptions)
+
+
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+ -Xsclock=330MHz;-Xsfp-relaxed;-Xsparallel=2
+
+
+
+
+
+
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json
new file mode 100755
index 0000000000..aa107a266e
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/sample.json
@@ -0,0 +1,57 @@
+{
+ "guid": "3228581F-9DF8-4696-9B1C-0B31286B97C3",
+ "name": "QR Decomposition of Matrices on FPGA",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Reference Designs"],
+ "description": "Reference design demonstrating high-performance QR decomposition of complex matrices on FPGA",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "env": [
+ "export CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=32MB"
+ ],
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./qrd.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "env": [
+ "set CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=32MB"
+ ],
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "qrd.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt
new file mode 100755
index 0000000000..5003e6a357
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/CMakeLists.txt
@@ -0,0 +1,129 @@
+set(DEVICE_SOURCE_FILE qrd.cpp)
+set(DEVICE_HEADER_FILE qrd.hpp)
+set(HOST_SOURCE_FILE qrd_demo.cpp)
+set(TARGET_NAME qrd)
+
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+set(REPORTS_TARGET ${TARGET_NAME}_report)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Design specific constant values
+set(ROWS_COMPONENT_A10 128)
+set(COLS_COMPONENT_A10 128)
+
+set(ROWS_COMPONENT_S10 256)
+set(COLS_COMPONENT_S10 256)
+
+set(FIXED_ITERATIONS_A10 64)
+set(FIXED_ITERATIONS_S10 105)
+
+set(SEED_A10 5)
+set(SEED_S10 1)
+
+# Set parameter values assuming target is Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+SET(FIXED_ITERATIONS ${FIXED_ITERATIONS_A10})
+SET(SEED ${SEED_A10})
+SET(ROWS_COMPONENT ${ROWS_COMPONENT_A10})
+SET(COLS_COMPONENT ${COLS_COMPONENT_A10})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+ SET(FIXED_ITERATIONS ${FIXED_ITERATIONS_S10})
+ SET(SEED ${SEED_S10})
+ SET(ROWS_COMPONENT ${ROWS_COMPONENT_S10})
+ SET(COLS_COMPONENT ${COLS_COMPONENT_S10})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS -fintelfpga -c -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT})
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+separate_arguments(USER_HARDWARE_FLAGS)
+set(HARDWARE_LINK_FLAGS -fintelfpga -Xshardware -Xsclock=330MHz -Xsfp-relaxed -Xsparallel=2 -Xsseed=${SEED} -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS} -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT})
+set(FINAL_LINK_FLAGS -fintelfpga -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT})
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR -DFIXED_ITERATIONS=${FIXED_ITERATIONS} -DROWS_COMPONENT=${ROWS_COMPONENT} -DCOLS_COMPONENT=${COLS_COMPONENT}")
+set(EMULATOR_LINK_FLAGS -fintelfpga )
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${DEVICE_SOURCE_FILE} ${HOST_SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set(DEVICE_FPGA_OBJ "qrd_fpga.o")
+ set(DEVICE_IMAGE_FPGA_OBJ "qrd_fpga.a")
+ set(HOST_FPGA_OBJ "qrd_host.o")
+
+ add_custom_command(OUTPUT ${DEVICE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${DEVICE_FPGA_OBJ}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+ add_custom_command(OUTPUT ${HOST_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_COMPILE_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/${HOST_SOURCE_FILE} -o ${HOST_FPGA_OBJ}
+ DEPENDS ${HOST_SOURCE_FILE})
+
+ add_custom_command(OUTPUT ${DEVICE_IMAGE_FPGA_OBJ}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link=image ${DEVICE_FPGA_OBJ} -o ${DEVICE_IMAGE_FPGA_OBJ}
+ DEPENDS ${DEVICE_FPGA_OBJ})
+
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${FINAL_LINK_FLAGS} ${HOST_FPGA_OBJ} ${DEVICE_IMAGE_FPGA_OBJ} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET}
+ DEPENDS ${DEVICE_IMAGE_FPGA_OBJ} ${HOST_FPGA_OBJ})
+endif()
+
+# fpga report
+if(WIN32)
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ separate_arguments(WIN_FLAGS WINDOWS_COMMAND)
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${WIN_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+
+else()
+ add_custom_target(report DEPENDS ${REPORTS_TARGET} )
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_SOURCE_FILE} ${DEVICE_SOURCE_FILE} COPYONLY)
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${DEVICE_HEADER_FILE} ${DEVICE_HEADER_FILE} COPYONLY)
+
+ add_custom_command(OUTPUT ${REPORTS_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS} -fsycl-link ${DEVICE_SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${REPORTS_TARGET}
+ DEPENDS ${DEVICE_SOURCE_FILE} ${DEVICE_HEADER_FILE})
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
+
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja
new file mode 100755
index 0000000000..619923b204
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/build.ninja
@@ -0,0 +1,32 @@
+device_source_file = qrd.cpp
+device_header_file = qrd.hpp
+host_source_file = qrd_demo.cpp
+target_name = qrd
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware -Xsclock=330MHz -Xsfp-relaxed -Xsparallel=2
+emulator_flags = -fintelfpga -DFPGA_EMULATOR -Xsfast-emulator
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} ${device_source_file} ${host_source_file} ${design_flags} -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 ${device_source_file} ${host_source_file} -DFIXED_ITERATIONS=64 -DROWS_COMPONENT=128 -DCOLS_COMPONENT=128 -Xsseed=5 -fsycl-link -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 ${device_source_file} ${host_source_file} -DFIXED_ITERATIONS=105 -DROWS_COMPONENT=256 -DCOLS_COMPONENT=256 -Xsseed=1 -fsycl-link -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp
new file mode 100755
index 0000000000..a6d973cbaa
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.cpp
@@ -0,0 +1,318 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include
+#include
+#include
+#include
+#include
+
+#include "qrd.hpp"
+
+using std::vector;
+using namespace sycl;
+
+template <int begin, int end>
+struct Unroller {
+  template <typename Action>
+  static void Step(const Action &action) {
+    action(begin);
+    Unroller<begin + 1, end>::Step(action);
+  }
+};
+
+template <int end>
+struct Unroller<end, end> {
+  template <typename Action>
+  static void Step(const Action &action) {}
+};
+
+struct MyComplex {
+ float xx;
+ float yy;
+ MyComplex(float x, float y) {
+ xx = x;
+ yy = y;
+ }
+ MyComplex() {}
+ const MyComplex operator+(const MyComplex other) const {
+ return MyComplex(xx + other.xx, yy + other.yy);
+ }
+};
+
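+// Complex multiply with the second operand conjugated: returns a * conj(b),
+// the form needed when accumulating the complex dot product
+// sum(a[k] * conj(b[k])) in the kernel below.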
+MyComplex MulMycomplex(MyComplex a, MyComplex b) {
+ MyComplex c;
+ c.xx = a.xx * b.xx + a.yy * b.yy;
+ c.yy = a.yy * b.xx - a.xx * b.yy;
+ return c;
+}
+
+// Forward declare the kernel name
+// (This will become unnecessary in a future compiler version.)
+class QRD;
+
+void QRDecomposition(vector<float> &in_matrix, vector<float> &out_matrix, queue &q,
+ size_t matrices, size_t reps) {
+ // Number of complex elements in the matrix
+ constexpr int kNumComplexElements = COLS_COMPONENT * ROWS_COMPONENT;
+
+ // Sizes of allocated memories for input and output matrix
+ constexpr int kInputMatrixSize = kNumComplexElements * 2;
+ constexpr int kOutputMatrixSize =
+ (ROWS_COMPONENT + 1) * COLS_COMPONENT * 3;
+
+ // Constants related to the memory configuration of the kernel's local
+ // memories
+ // We want 4 complex elements (2 floating point values) in each memory bank
+ constexpr int kNumElementsPerBank = 4;
+ // Set the bankwidth in bytes
+ constexpr int kBankwidth = kNumElementsPerBank * 8;
+ constexpr int kNumBanks = ROWS_COMPONENT / kNumElementsPerBank;
+
+ constexpr int kLoadIter = kNumComplexElements / kNumElementsPerBank;
+ constexpr int kStoreIter = kNumComplexElements / kNumElementsPerBank;
+ constexpr short kNumBuffers = 4;
+
+ // We will process 'chunk' number of matrices in each run of the kernel
+ short chunk = 2048;
+ if (matrices % chunk) {
+ chunk = 1;
+ }
+
+ // Create buffers and allocate space for them.
+ buffer *input_matrix[kNumBuffers], *output_matrix[kNumBuffers];
+ for (short i = 0; i < kNumBuffers; i++) {
+ input_matrix[i] = new buffer(kInputMatrixSize * chunk);
+ output_matrix[i] = new buffer(kOutputMatrixSize * chunk);
+ }
+
+ for (size_t r = 0; r < reps; r++) {
+ for (size_t b = 0, it = 0; it < matrices;
+ it += chunk, b = (b + 1) % kNumBuffers) {
+ const float *kPtr = in_matrix.data() + kInputMatrixSize * it;
+ float *kPtr2 = out_matrix.data() + kOutputMatrixSize * it;
+ int matrices = chunk;
+
+ q.submit([&](handler &h) {
+ auto in_matrix2 =
+ input_matrix[b]->get_access(h);
+ h.copy(kPtr, in_matrix2);
+ });
+
+ q.submit([&](handler &h) {
+ auto in_matrix = input_matrix[b]->get_access(h);
+ auto out_matrix =
+ output_matrix[b]->get_access(h);
+ auto out_matrix2 = out_matrix;
+ h.single_task<QRD>([=]() [[intel::kernel_args_restrict]] {
+ for (int l = 0; l < matrices; l++) {
+ [[intelfpga::bankwidth(kBankwidth),
+ intelfpga::numbanks(kNumBanks)]] struct {
+ MyComplex d[ROWS_COMPONENT];
+ } a_matrix[COLS_COMPONENT], ap_matrix[COLS_COMPONENT],
+ aload_matrix[COLS_COMPONENT];
+
+ MyComplex vector_ai[ROWS_COMPONENT], vector_ti[ROWS_COMPONENT];
+ MyComplex s_or_i[COLS_COMPONENT];
+
+ // Copy data from DDR memory to on-chip memory.
+ int idx = l * kNumComplexElements / kNumElementsPerBank;
+ for (short li = 0; li < kLoadIter; li++) {
+ MyComplex tmp[kNumElementsPerBank];
+ Unroller<0, kNumElementsPerBank>::Step([&](int k) {
+ tmp[k].xx = in_matrix[idx * 2 * kNumElementsPerBank + k * 2];
+ tmp[k].yy =
+ in_matrix[idx * 2 * kNumElementsPerBank + k * 2 + 1];
+ });
+
+ idx++;
+ int jtmp = li % (kNumBanks);
+
+ Unroller<0, kNumBanks>::Step([&](int k) {
+ Unroller<0, kNumElementsPerBank>::Step([&](int t) {
+ if (jtmp == k) {
+ aload_matrix[li / (kNumBanks)]
+ .d[k * kNumElementsPerBank + t].xx = tmp[t].xx;
+ aload_matrix[li / (kNumBanks)]
+ .d[k * kNumElementsPerBank + t].yy = tmp[t].yy;
+ }
+
+ // Delay data signals to create a vine-based data distribution
+ // to lower signal fanout.
+ tmp[t].xx = intel::fpga_reg(tmp[t].xx);
+ tmp[t].yy = intel::fpga_reg(tmp[t].yy);
+ });
+
+ jtmp = intel::fpga_reg(jtmp);
+ });
+ }
+
+ float p_ii_x, i_r_ii_x;
+ short i = -1;
+ short j = N_VALUE - FIXED_ITERATIONS < 0
+ ? (N_VALUE - FIXED_ITERATIONS)
+ : 0;
+ int qr_idx = l * kOutputMatrixSize / 2;
+
+ [[intelfpga::ii(1)]] [[intelfpga::ivdep(FIXED_ITERATIONS)]]
+ for (int s = 0; s < ITERATIONS; s++) {
+ MyComplex vector_t[ROWS_COMPONENT];
+ MyComplex sori[kNumBanks];
+
+ bool j_eq_i[kNumBanks], i_gt_0[kNumBanks],
+ i_ge_0_j_eq_i[kNumBanks], j_eq_i_plus_1[kNumBanks],
+ i_lt_0[kNumBanks];
+
+ Unroller<0, kNumBanks>::Step([&](int k) {
+ i_gt_0[k] = intel::fpga_reg(i > 0);
+ i_lt_0[k] = intel::fpga_reg(i < 0);
+ j_eq_i[k] = intel::fpga_reg(j == i);
+ i_ge_0_j_eq_i[k] = intel::fpga_reg(i >= 0 && j >= i);
+ j_eq_i_plus_1[k] = intel::fpga_reg(j == i + 1);
+ sori[k].xx = intel::fpga_reg(s_or_i[j].xx);
+ sori[k].yy = intel::fpga_reg(s_or_i[j].yy);
+ });
+
+ Unroller<0, ROWS_COMPONENT>::Step([&](int k) {
+ vector_t[k].xx = aload_matrix[j].d[k].xx;
+ vector_t[k].yy = aload_matrix[j].d[k].yy;
+ if (i_gt_0[k / kNumElementsPerBank]) {
+ vector_t[k].xx = a_matrix[j].d[k].xx;
+ vector_t[k].yy = a_matrix[j].d[k].yy;
+ }
+ if (j_eq_i[k / kNumElementsPerBank]) {
+ vector_ai[k].xx = vector_t[k].xx;
+ vector_ai[k].yy = vector_t[k].yy;
+ }
+ });
+
+ Unroller<0, ROWS_COMPONENT>::Step([&](int k) {
+ vector_t[k] =
+ MulMycomplex(vector_ai[k],
+ i_lt_0[k / kNumElementsPerBank]
+ ? MyComplex(0.0, 0.0)
+ : sori[k / kNumElementsPerBank]) +
+ (j_eq_i[k / kNumElementsPerBank] ? MyComplex(0.0, 0.0)
+ : vector_t[k]);
+ if (i_ge_0_j_eq_i[k / kNumElementsPerBank]) {
+ ap_matrix[j].d[k].xx = a_matrix[j].d[k].xx =
+ vector_t[k].xx;
+ ap_matrix[j].d[k].yy = a_matrix[j].d[k].yy =
+ vector_t[k].yy;
+ }
+ if (j_eq_i_plus_1[k / kNumElementsPerBank]) {
+ vector_ti[k] = vector_t[k];
+ }
+ });
+
+ MyComplex p_ij = MyComplex(0, 0);
+ Unroller<0, ROWS_COMPONENT>::Step([&](int k) {
+ p_ij = p_ij + MulMycomplex(vector_t[k], vector_ti[k]);
+ });
+
+ if (j == i + 1) {
+ p_ii_x = p_ij.xx;
+ i_r_ii_x = rsqrt(p_ij.xx);
+ }
+
+ MyComplex s_ij =
+ MyComplex(0.0f - (p_ij.xx) / p_ii_x, p_ij.yy / p_ii_x);
+
+ if (j >= 0) {
+ s_or_i[j] = MyComplex(j == i + 1 ? i_r_ii_x : s_ij.xx,
+ j == i + 1 ? 0.0f : s_ij.yy);
+ }
+
+ MyComplex r_ii = j == i + 1 ? MyComplex(sycl::sqrt(p_ii_x), 0.0)
+ : MyComplex(i_r_ii_x * p_ij.xx,
+ i_r_ii_x * p_ij.yy);
+
+ if (j >= i + 1 && i + 1 < N_VALUE) {
+ out_matrix[qr_idx * 2] = r_ii.xx;
+ out_matrix[qr_idx * 2 + 1] = r_ii.yy;
+ qr_idx++;
+ }
+
+ if (j == N_VALUE - 1) {
+ j = ((N_VALUE - FIXED_ITERATIONS) > i)
+ ? (i + 1)
+ : (N_VALUE - FIXED_ITERATIONS);
+ i++;
+ } else {
+ j++;
+ }
+ }
+
+ qr_idx /= 4;
+ for (short si = 0; si < kStoreIter; si++) {
+ int desired = si % (kNumBanks);
+ bool get[kNumBanks];
+ Unroller<0, kNumBanks>::Step([&](int k) {
+ get[k] = desired == k;
+ desired = intel::fpga_reg(desired);
+ });
+
+ MyComplex tmp[kNumElementsPerBank];
+ Unroller<0, kNumBanks>::Step([&](int t) {
+ Unroller<0, kNumElementsPerBank>::Step([&](int k) {
+ tmp[k].xx = get[t] ? ap_matrix[si / (kNumBanks)]
+ .d[t * kNumElementsPerBank + k]
+ .xx
+ : intel::fpga_reg(tmp[k].xx);
+ tmp[k].yy = get[t] ? ap_matrix[si / (kNumBanks)]
+ .d[t * kNumElementsPerBank + k]
+ .yy
+ : intel::fpga_reg(tmp[k].yy);
+ });
+ });
+
+ Unroller<0, 4>::Step([&](int k) {
+ out_matrix2[qr_idx * 2 * kNumElementsPerBank + k * 2] =
+ tmp[k].xx;
+ out_matrix2[qr_idx * 2 * kNumElementsPerBank + k * 2 + 1] =
+ tmp[k].yy;
+ });
+
+ qr_idx++;
+ }
+ }
+ });
+ });
+
+ q.submit([&](handler &h) {
+ auto final_matrix = output_matrix[b]->get_access<access::mode::read>(h);
+ h.copy(final_matrix, kPtr2);
+ });
+ }
+ }
+
+ for (short b = 0; b < kNumBuffers; b++) {
+ delete input_matrix[b];
+ delete output_matrix[b];
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp
new file mode 100755
index 0000000000..4ada530ea7
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd.hpp
@@ -0,0 +1,43 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+// The values for FIXED_ITERATIONS, ROWS_COMPONENT and COLS_COMPONENT will be
+// supplied by the build system (cmake/build.ninja)
+
+// Architecture/Design Parameters used to implement the triangular loop
+// structure of the design. See the tutorial on triangular loop optimization
+// for more details.
+#define N_VALUE COLS_COMPONENT
+
+#define M_MINUS_COLS \
+ (FIXED_ITERATIONS > COLS_COMPONENT ? FIXED_ITERATIONS - COLS_COMPONENT : 0)
+
+#define ITERATIONS \
+ (COLS_COMPONENT + M_MINUS_COLS + (COLS_COMPONENT + 1) * COLS_COMPONENT / 2 + \
+ FIXED_ITERATIONS * (FIXED_ITERATIONS - 1) / 2 - \
+ M_MINUS_COLS * (M_MINUS_COLS - 1) / 2)
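+
+// Illustrative example only (the real values come from the build system):
+// if COLS_COMPONENT were 128 and FIXED_ITERATIONS were 64, then
+// M_MINUS_COLS = 0 and
+// ITERATIONS = 128 + 0 + (129 * 128) / 2 + (64 * 63) / 2 - 0
+//            = 128 + 8256 + 2016 = 10400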
diff --git a/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp
new file mode 100755
index 0000000000..4bee78a672
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/ReferenceDesigns/qrd/src/qrd_demo.cpp
@@ -0,0 +1,233 @@
+// ==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// This agreement shall be governed in all respects by the laws of the State of
+// California and by the laws of the United States of America.
+
+#include <cmath>
+#include <cstdlib>
+
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <chrono>
+#include <iostream>
+#include <list>
+#include <vector>
+
+#include "dpc_common.hpp"
+#include "qrd.hpp"
+
+using namespace std;
+using namespace std::chrono;
+using namespace sycl;
+
+// Run the modified Gram-Schmidt QR Decomposition algorithm on the given
+// matrices. The function will do the following:
+// 1. Transfer the input matrices to the FPGA.
+// 2. Run the algorithm.
+// 3. Copy the output data back to the host.
+// The above process is carried out 'reps' number of times.
+void QRDecomposition(vector<float> &in_matrix, vector<float> &out_matrix, queue &q,
+ size_t matrices, size_t reps);
+
+int main(int argc, char *argv[]) {
+ constexpr size_t kRandomSeed = 1138;
+ constexpr size_t kRandomMin = 1;
+ constexpr size_t kRandomMax = 10;
+
+ size_t matrices = argc > 1 ? atoi(argv[1]) : 1;
+ if (matrices < 1) {
+ cout << "Must run at least 1 matrix\n";
+ return 1;
+ }
+
+ try {
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ queue q = queue(device_selector, dpc_common::exception_handler);
+ device device = q.get_device();
+ cout << "Device name: " << device.get_info().c_str()
+ << "\n";
+
+ vector<float> a_matrix;
+ vector<float> qr_matrix;
+
+ constexpr size_t kAMatrixSizeFactor = ROWS_COMPONENT * COLS_COMPONENT * 2;
+ constexpr size_t kQRMatrixSizeFactor =
+ (ROWS_COMPONENT + 1) * COLS_COMPONENT * 3;
+ constexpr size_t kIndexAccessFactor = 2;
+
+ a_matrix.resize(matrices * kAMatrixSizeFactor);
+ qr_matrix.resize(matrices * kQRMatrixSizeFactor);
+
+ // For output-postprocessing
+ float q_matrix[ROWS_COMPONENT][COLS_COMPONENT][2];
+ float r_matrix[COLS_COMPONENT][COLS_COMPONENT][2];
+
+ cout << "Generating " << matrices << " random matri"
+ << ((matrices == 1) ? "x " : "ces ") << "\n";
+
+ srand(kRandomSeed);
+
+ for (size_t i = 0; i < matrices; i++) {
+ for (size_t row = 0; row < ROWS_COMPONENT; row++) {
+ for (size_t col = 0; col < COLS_COMPONENT; col++) {
+ int random_val = rand();
+ float random_double =
+ random_val % (kRandomMax - kRandomMin) + kRandomMin;
+ a_matrix[i * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor] = random_double;
+ int random_val_imag = rand();
+ random_double =
+ random_val_imag % (kRandomMax - kRandomMin) + kRandomMin;
+ a_matrix[i * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor + 1] = random_double;
+ }
+ }
+ }
+
+ QRDecomposition(a_matrix, qr_matrix, q, 1, 1); // Accelerator warmup
+
+#if defined(FPGA_EMULATOR)
+ size_t reps = 2;
+#else
+ size_t reps = 32;
+#endif
+ cout << "Running QR decomposition of " << matrices << " matri"
+ << ((matrices == 1) ? "x " : "ces ")
+ << ((reps > 1) ? "repeatedly" : "") << "\n";
+
+ high_resolution_clock::time_point start_time = high_resolution_clock::now();
+ QRDecomposition(a_matrix, qr_matrix, q, matrices, reps);
+ high_resolution_clock::time_point end_time = high_resolution_clock::now();
+ duration<double> diff = end_time - start_time;
+ q.throw_asynchronous();
+
+ cout << " Total duration: " << diff.count() << " s"
+ << "\n";
+ cout << "Throughput: " << reps * matrices / diff.count() / 1000
+ << "k matrices/s"
+ << "\n";
+
+ list<size_t> to_check;
+ // We will check at least matrix 0
+ to_check.push_back(0);
+ // Spot check the last and the middle one
+ if (matrices > 2) to_check.push_back(matrices / 2);
+ if (matrices > 1) to_check.push_back(matrices - 1);
+
+ cout << "Verifying results on matrix";
+
+ for (size_t matrix : to_check) {
+ cout << " " << matrix;
+ size_t idx = 0;
+ for (size_t i = 0; i < COLS_COMPONENT; i++) {
+ for (size_t j = 0; j < COLS_COMPONENT; j++) {
+ if (j < i)
+ r_matrix[i][j][0] = r_matrix[i][j][1] = 0;
+ else {
+ r_matrix[i][j][0] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ r_matrix[i][j][1] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ }
+ }
+ }
+
+ for (size_t j = 0; j < COLS_COMPONENT; j++) {
+ for (size_t i = 0; i < ROWS_COMPONENT; i++) {
+ q_matrix[i][j][0] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ q_matrix[i][j][1] = qr_matrix[matrix * kQRMatrixSizeFactor + idx++];
+ }
+ }
+
+ float acc_real = 0;
+ float acc_imag = 0;
+ float v_matrix[ROWS_COMPONENT][COLS_COMPONENT][2] = {{{0}}};
+ for (size_t i = 0; i < ROWS_COMPONENT; i++) {
+ for (size_t j = 0; j < COLS_COMPONENT; j++) {
+ acc_real = 0;
+ acc_imag = 0;
+ for (size_t k = 0; k < COLS_COMPONENT; k++) {
+ acc_real += q_matrix[i][k][0] * r_matrix[k][j][0] -
+ q_matrix[i][k][1] * r_matrix[k][j][1];
+ acc_imag += q_matrix[i][k][0] * r_matrix[k][j][1] +
+ q_matrix[i][k][1] * r_matrix[k][j][0];
+ }
+ v_matrix[i][j][0] = acc_real;
+ v_matrix[i][j][1] = acc_imag;
+ }
+ }
+
+ float error = 0;
+ size_t count = 0;
+ constexpr float kErrorThreshold = 1e-4;
+ for (size_t row = 0; row < ROWS_COMPONENT; row++) {
+ for (size_t col = 0; col < COLS_COMPONENT; col++) {
+ if (std::isnan(v_matrix[row][col][0]) ||
+ std::isnan(v_matrix[row][col][1])) {
+ count++;
+ }
+ float real = v_matrix[row][col][0] -
+ a_matrix[matrix * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor];
+ float imag = v_matrix[row][col][1] -
+ a_matrix[matrix * kAMatrixSizeFactor +
+ col * ROWS_COMPONENT * kIndexAccessFactor +
+ row * kIndexAccessFactor + 1];
+ if (sqrt(real * real + imag * imag) >= kErrorThreshold) {
+ error += sqrt(real * real + imag * imag);
+ count++;
+ }
+ }
+ }
+
+ if (count > 0) {
+ cout << "\nFAILED\n";
+ cout << "\n"
+ << "!!!!!!!!!!!!!! Error = " << error << " in " << count << " / "
+ << ROWS_COMPONENT * COLS_COMPONENT << "\n";
+ return 1;
+ }
+ }
+
+ cout << "\nPASSED\n";
+ return 0;
+
+ } catch (sycl::exception const &e) {
+ cout << "Caught a synchronous SYCL exception: " << e.what() << "\n";
+ cout << " If you are targeting an FPGA hardware, "
+ "ensure that your system is plugged to an FPGA board that is "
+ "set up correctly"
+ << "\n";
+ cout << " If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR"
+ << "\n";
+
+ terminate();
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt
new file mode 100755
index 0000000000..5c0cea463c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(DoubleBuffering)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md
new file mode 100755
index 0000000000..31b7e3df37
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/README.md
@@ -0,0 +1,223 @@
+# Double Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing
+This FPGA tutorial demonstrates how to parallelize host-side processing and buffer transfers between host and device with kernel execution, which can improve overall application performance.
+
+***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How and when to implement the double buffering optimization technique
+| Time to complete | 30 minutes
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+## Purpose
+In an application where the FPGA kernel is executed multiple times, the host must perform the following processing and buffer transfers before each kernel invocation.
+1. The output data from the *previous* invocation must be transferred from device to host and then processed by the host. Examples of this processing include:
+ * Copying the data to another location
+ * Rearranging the data
+ * Verifying it in some way
+2. The input data for the *next* invocation must be processed by the host and then transferred to the device. Examples of this processing include:
+ * Copying the data from another location
+ * Rearranging the data for kernel consumption
+ * Generating the data in some way
+
+Without double buffering, host processing and buffer transfers occur *between* kernel executions. Therefore, there is a gap in time between kernel executions, which you can refer to as kernel *downtime* (see diagram below). If these operations overlap with kernel execution, the kernels can execute back-to-back with minimal downtime, thereby increasing overall application performance.
+
+### Determining When Double Buffering Is Possible
+
+Let's define the required variables:
+* **R** = Time to transfer the kernel's output buffer from device to host.
+* **Op** = Host-side processing time of kernel output data (*output processing*)
+* **Ip** = Host-side processing time for kernel input data (*input processing*)
+* **W** = Time to transfer the kernel's input buffer from host to device.
+* **K** = Kernel execution time
+
+
+
+In general, **R**, **Op**, **Ip**, and **W** operations must all complete before the next kernel is launched. To maximize performance, while one kernel is executing on the device, these operations should execute simultaneously on the host and operate on a second set of buffer locations. They should complete before the current kernel completes, thus allowing the next kernel to be launched immediately with no downtime. In general, to maximize performance, the host must launch a new kernel every **K**.
+
+This leads to the following constraint:
+
+```c++
+R + Op + Ip + W <= K, in order to minimize kernel downtime.
+```
+If the above constraint is not satisfied, a performance improvement may still be observed because *some* overlap (perhaps not complete overlap) is still possible. Further improvement is possible by extending the double buffering concept to N-way buffering (see the corresponding tutorial).
+
+### Measuring the Impact of Double Buffering
+
+You must get a sense of the kernel downtime to identify the degree to which this technique can help improve performance.
+
+This can be done by querying the total kernel execution time from the runtime and comparing it to the overall application execution time. In an application where kernels execute with minimal downtime, these two numbers will be close. However, if kernels have a lot of downtime, overall execution time will notably exceed kernel execution time. The tutorial code exemplifies how to do this.
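+
+As a rough sketch, the kernel-only time can be obtained from SYCL event profiling (the queue must be created with the `enable_profiling` property), while the overall time is measured with an ordinary host timer. The snippet below is illustrative only; `LaunchKernel()` is a placeholder for the kernel submission, not a function from this tutorial's code.
+
+```c++
+// Assumes: using namespace sycl; and a queue q created with the
+// property::queue::enable_profiling property. LaunchKernel() is a
+// placeholder for this tutorial's kernel submission.
+auto wall_start = std::chrono::steady_clock::now();
+event e = LaunchKernel(q);
+e.wait();
+auto wall_end = std::chrono::steady_clock::now();
+
+// Kernel-only time (nanoseconds) from the event's profiling info.
+auto k_start = e.get_profiling_info<info::event_profiling::command_start>();
+auto k_end = e.get_profiling_info<info::event_profiling::command_end>();
+
+double kernel_ms = (k_end - k_start) * 1e-6;
+double wall_ms =
+    std::chrono::duration<double, std::milli>(wall_end - wall_start).count();
+// A large gap between wall_ms and kernel_ms indicates kernel downtime.
+```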
+
+### Tutorial Implementation Notes
+
+The basic idea is to:
+1. Perform the input processing for the first two kernel executions and queue them both.
+2. Immediately call the `process_output()` method for the first kernel. Because of the implicit data dependency, the SYCL* runtime automatically blocks this call until the first kernel completes.
+3. When the first kernel completes, the second kernel begins executing immediately because it was already queued.
+4. While the second kernel runs, the host processes the output data from the first kernel and prepares the input data for the third kernel.
+5. As long as the above operations complete before the second kernel completes, the third kernel is queued early enough to allow it to be launched immediately after the second kernel.
+
+The process then repeats.
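+
+A minimal sketch of this scheme is shown below. `ProcessInput()`, `LaunchKernel()`, and `ProcessOutput()` are placeholders for the host-side input processing, kernel submission, and host-side output processing described above; the tutorial source implements the same pattern with its own helper functions.
+
+```c++
+// 2-way (double) buffering sketch with two buffer sets, buf[0] and buf[1].
+ProcessInput(buf[0]);
+ProcessInput(buf[1]);
+LaunchKernel(buf[0]);
+
+for (int i = 1; i < kTimes; i++) {
+  LaunchKernel(buf[i % 2]);          // Queue the next kernel immediately.
+  ProcessOutput(buf[(i - 1) % 2]);   // Blocks until kernel i-1 completes.
+  ProcessInput(buf[(i - 1) % 2]);    // Prepare input for kernel i+1.
+}
+ProcessOutput(buf[(kTimes - 1) % 2]);  // Output of the final kernel.
+```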
+
+The impact of double buffering on the total runtime of the tutorial program will be analyzed in the "Running the Sample" section below.
+
+## Key Concepts
+* The double buffering optimization technique
+* Determining when double buffering is beneficial
+* How to measure the impact of double buffering
+
+## License
+This code sample is licensed under MIT license.
+
+
+## Building the `double_buffering` Tutorial
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+ ### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Examining the Reports
+Locate `report.html` in the `double_buffering_report.prj/reports/` or `double_buffering_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*.
+
+Note that because the optimization described in this tutorial takes place at the *runtime* level, the FPGA compiler report will not show a difference between the optimized and unoptimized cases.
+
+
+## Running the Sample
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./double_buffering.fpga_emu (Linux)
+ double_buffering.fpga_emu.exe (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+ ./double_buffering.fpga (Linux)
+ ```
+
+### Example of Output
+
+```
+Platform name: Intel(R) FPGA SDK for OpenCL(TM)
+Device name: pac_a10 : Intel PAC Platform (pac_ee00000)
+
+
+Executing kernel 100 times in each round.
+
+*** Beginning execution, without double buffering
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time without double buffering = 29742 ms
+Total kernel-only execution time without double buffering = 17856 ms
+Throughput = 35.255249 MB/s
+
+
+*** Beginning execution, with double buffering.
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time with double buffering = 17967 ms
+Total kernel-only execution time with double buffering = 17869 ms
+Throughput = 58.35976 MB/s
+
+
+Verification PASSED
+```
+
+### Discussion of Results
+
+A test compile of this tutorial design achieved a maximum frequency (fMAX) of approximately 340 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results with and without double buffering are shown in the following table:
+
+Configuration | Overall Execution Time (ms) | Total Kernel Execution time (ms)
+-|-|-
+Without double buffering | 23462 | 15187
+With double buffering | 15145 | 15034
+
+In both runs, the total kernel execution time is similar, as expected. However, without double buffering, the overall execution time notably exceeds the total kernel execution time, implying there is downtime between kernel executions. With double buffering, the overall execution time is close to the total kernel execution time.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln
new file mode 100755
index 0000000000..4108b65da8
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "double_buffering", "double_buffering.vcxproj", "{6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Debug|x64.ActiveCfg = Debug|x64
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Debug|x64.Build.0 = Debug|x64
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Release|x64.ActiveCfg = Release|x64
+ {6910A54A-BFE5-462F-9F3B-B84F62C5ADD1}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {1878B8F8-3C90-4CB5-9A71-66501FA4A3BA}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj
new file mode 100755
index 0000000000..b7ee382578
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/double_buffering.vcxproj
@@ -0,0 +1,160 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+ 15.0
+ {6910a54a-bfe5-462f-9f3b-b84f62c5add1}
+ Win32Proj
+ double_buffering
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)double_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)double_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png
new file mode 100755
index 0000000000..2a306929bc
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/downtime.png differ
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json
new file mode 100755
index 0000000000..b10e6e185a
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "B210B44F-FB86-4F42-BA4A-9980805350FF",
+ "name": "Overlapping Kernel Execution with Buffer Transfers and Host Processing through Double Buffering",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"],
+ "description": "FPGA tutorial design to demonstrate overlapping kernel execution with buffer transfers and host-processing to improve system performance",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "targetDevice": ["FPGA"],
+ "builder": ["ide", "cmake"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./double_buffering.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "double_buffering.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt
new file mode 100755
index 0000000000..f918135042
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/CMakeLists.txt
@@ -0,0 +1,89 @@
+set(SOURCE_FILE double_buffering.cpp)
+set(TARGET_NAME double_buffering)
+
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+
+# Select the target board based on the FPGA_BOARD flag
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS "-fintelfpga")
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE})
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS})
+ set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS})
+endif()
+
+# generate report
+if(WIN32)
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+endif()
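+
+# For reference, with the default A10 board the Linux report command above
+# expands to roughly the following (illustrative only):
+#   dpcpp -fintelfpga -Xshardware -Xsboard=intel_a10gx_pac:pac_a10 \
+#     -fsycl-link double_buffering.cpp -o double_buffering_report.a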
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja
new file mode 100755
index 0000000000..3e8fdc6126
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/build.ninja
@@ -0,0 +1,30 @@
+source_file = double_buffering.cpp
+target_name = double_buffering
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp
new file mode 100755
index 0000000000..556507e307
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/double_buffering/src/double_buffering.cpp
@@ -0,0 +1,349 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include "dpc_common.hpp"
+
+using namespace sycl;
+
+// kTimes = # times to execute the kernel. kTimes must be >= 2
+// kSize = # of floats to process on each kernel execution.
+// run less in emulation to avoid high run time
+#if defined(FPGA_EMULATOR)
+constexpr int kTimes = 20;
+constexpr int kSize = 4096;
+#else
+constexpr int kTimes = 100;
+constexpr int kSize = 2621440;
+#endif
+
+// Kernel executes a power function (base^kPow). kPow must be
+// >= 2. Increasing kPow increases kernel execution
+// time, but ProcessOutput() time will also increase.
+constexpr int kPow = 20;
+
+// Number of iterations through the main loop
+constexpr int kNumRuns = 2;
+
+bool pass = true;
+
+class SimpleVpow;
+
+/* Kernel function.
+ Performs buffer_b[i] = buffer_a[i] ** pow
+ Only supports pow >= 2.
+ This kernel is not meant to be an optimal implementation of the power
+ operation -- it's just a sample kernel for this tutorial whose execution time
+ is easily controlled via the pow parameter. SYCL buffers are created
+ externally and passed in by reference to control (external to this function)
+ when the buffers are destructed. The destructor causes a blocking buffer
+ transfer from device to host and double buffering requires us to not block
+ here (because we need to launch another kernel). So we only want this
+ transfer to occur at the end of overall execution, not at the end of each
+ individual kernel execution.
+*/
+void SimplePow(std::unique_ptr<queue> &q, buffer<float, 1> &buffer_a,
+ buffer<float, 1> &buffer_b, event &e) {
+ // Submit to the queue and execute the kernel
+ e = q->submit([&](handler &h) {
+ // Get kernel access to the buffers
+ auto accessor_a = buffer_a.get_access<access::mode::read>(h);
+ auto accessor_b = buffer_b.get_access<access::mode::discard_write>(h);
+
+ const int num = kSize;
+ assert(kPow >= 2);
+ const int p = kPow - 1; // Assumes pow >= 2;
+
+ h.single_task<SimpleVpow>([=]() [[intel::kernel_args_restrict]] {
+ for (int j = 0; j < p; j++) {
+ if (j == 0) {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_a[i] * accessor_a[i];
+ }
+ } else {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_b[i] * accessor_a[i];
+ }
+ }
+ }
+ });
+ });
+
+ event update_host_event;
+ update_host_event = q->submit([&](handler &h) {
+ auto accessor_b = buffer_b.get_access<access::mode::read>(h);
+
+ /*
+ Explicitly instruct the SYCL runtime to copy the kernel's output buffer
+ back to the host upon kernel completion. This is not required for
+ functionality since the buffer access in ProcessOutput() also implicitly
+ instructs the runtime to copy the data back. But it should be noted that
+ this buffer access blocks ProcessOutput() until the kernel is complete
+ and the data is copied. In contrast, update_host() instructs the runtime
+ to perform the copy earlier. This allows ProcessOutput() to optionally
+ perform more useful work *before* making the blocking buffer access. Said
+ another way, this allows ProcessOutput() to potentially perform more work
+ in parallel with the runtime's copy operation.
+ */
+ h.update_host(accessor_b);
+ });
+}
+
+// Returns kernel execution time for a given SYCL event from a queue.
+ulong SyclGetExecTimeNs(event e) {
+ ulong start_time =
+ e.get_profiling_info<info::event_profiling::command_start>();
+ ulong end_time =
+ e.get_profiling_info<info::event_profiling::command_end>();
+ return (end_time - start_time);
+}
+
+// Local pow function for verifying results
+float MyPow(float input, int pow) {
+ return (pow == 0) ? 1 : input * MyPow(input, pow - 1);
+}
+
+/* Compares kernel output against expected output. Only compares part of the
+ output so that this method completes quickly. This is done
+ intentionally/artificially to keep host-processing time shorter than kernel
+ execution time. Grabs kernel output data from its SYCL buffer. Reading from
+ this buffer is a blocking operation that will block on the kernel completing.
+ Queries and records execution time of the kernel that just completed. This
+ is a natural place to do this because ProcessOutput() is blocked on kernel
+ completion.
+*/
+void ProcessOutput(buffer<float, 1> &input_buf,
+ buffer<float, 1> &output_buf, int exec_number, event e,
+ ulong &total_kernel_time_per_slot) {
+ auto input_buf_acc = input_buf.get_access<access::mode::read>();
+ auto output_buf_acc = output_buf.get_access<access::mode::read>();
+ int num_errors = 0;
+ int num_errors_to_print = 10;
+ /* The use of update_host() in the kernel function allows for additional
+ host-side operations to be performed here, in parallel with the buffer copy
+ operation from device to host, before the blocking access to the output
+ buffer is made via output_buf_acc[]. To be clear, no real operations are
+ done here and this is just a note that this is the place
+ where you *could* do it. */
+ for (int i = 0; i < kSize / 8; i++) {
+ const bool out_invalid = (MyPow(input_buf_acc[i], kPow) != output_buf_acc[i]);
+ if ((num_errors < num_errors_to_print) && out_invalid) {
+ if (num_errors == 0) {
+ pass = false;
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ". Showing up to " << num_errors_to_print
+ << " mismatches.\n";
+ }
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ", at element " << i << ". Expected " << std::fixed
+ << std::setprecision(16) << MyPow(input_buf_acc[i], kPow)
+ << " but got " << output_buf_acc[i] << "\n";
+ num_errors++;
+ }
+ }
+
+ // At this point we know the kernel has completed,
+ // so can query the profiling data.
+ total_kernel_time_per_slot += SyclGetExecTimeNs(e);
+}
+
+/*
+ Generates input data for the next kernel execution. Only fills part of the
+ buffer so that this method completes quickly. This is done
+ intentionally/artificially to keep host-processing time shorter than kernel
+ execution time. Writes the data into the associated SYCL buffer. The write
+ will block until the previous kernel execution, that is using this buffer,
+ completes.
+*/
+void ProcessInput(buffer<float, 1> &buf) {
+ // We are generating completely new input data, so can use discard_write()
+ // here to indicate we don't care about the SYCL buffer's current contents.
+ auto buf_acc = buf.get_access<access::mode::discard_write>();
+
+ // RNG seed
+ auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+
+ // RNG engine
+ std::default_random_engine dre(seed);
+
+ // generate random numbers between 1 and 2
+ std::uniform_real_distribution<float> di(1.0f, 2.0f);
+
+ // Randomly generate a start value and increment from there.
+ // Compared to randomly generating every value, this is done to
+ // speed up this function a bit.
+ float start_val = di(dre);
+
+ for (int i = 0; i < kSize / 8; i++) {
+ buf_acc[i] = start_val;
+ start_val++;
+ }
+}
+
+int main() {
+// Create queue, get platform and device
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+ std::cout << "\nEmulator output does not demonstrate true hardware "
+ "performance. The design may need to run on actual hardware "
+ "to observe the performance benefit of the optimization "
+ "exemplified in this tutorial.\n\n";
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ try {
+ auto prop_list =
+ property_list{property::queue::enable_profiling()};
+
+ std::unique_ptr<queue> q;
+ q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list));
+
+ platform platform = q->get_context().get_platform();
+ device device = q->get_device();
+ std::cout << "Platform name: "
+ << platform.get_info<info::platform::name>().c_str() << "\n";
+ std::cout << "Device name: "
+ << device.get_info<info::device::name>().c_str() << "\n\n\n";
+
+ std::cout << "Executing kernel " << kTimes << " times in each round.\n\n";
+
+ // Create a vector to store the input/output SYCL buffers
+ std::vector<buffer<float, 1>> input_buf;
+ std::vector<buffer<float, 1>> output_buf;
+
+ // SYCL events for each kernel launch.
+ event sycl_events[2];
+
+ // In nanoseconds. Total execution time of kernels in a given slot.
+ ulong total_kernel_time_per_slot[2];
+
+ // Total execution time of all kernels.
+ ulong total_kernel_time = 0;
+
+ // Allocate vectors to store the host-side copies of the input data
+ // Create and allocate the SYCL buffers
+ for (int i = 0; i < 2; i++) {
+ input_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+ output_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+ }
+
+ /*
+ Main loop. This loop runs twice to show the performance difference without
+ and with double buffering.
+ */
+ for (int i = 0; i < kNumRuns; i++) {
+ for (int i = 0; i < 2; i++) {
+ total_kernel_time_per_slot[i] = 0; // Initialize timers to zero.
+ }
+
+ switch (i) {
+ case 0: {
+ std::cout << "*** Beginning execution, without double buffering\n";
+ break;
+ }
+ case 1: {
+ std::cout << "*** Beginning execution, with double buffering.\n";
+ break;
+ }
+ default: {
+ std::cout << "*** Beginning execution.\n";
+ }
+ }
+
+ // Start the timer. This will include the time to process the input data
+ // for the first 2 kernel executions.
+ dpc_common::TimeInterval exec_time;
+
+ if (i == 0) { // Single buffering
+ for (int i = 0; i < kTimes; i++) {
+ // Only print every few iterations, just to limit the prints.
+ if (i % 10 == 0) {
+ std::cout << "Launching kernel #" << i << "\n";
+ }
+
+ ProcessInput(input_buf[0]);
+ SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]);
+ ProcessOutput(input_buf[0], output_buf[0], i, sycl_events[0],
+ total_kernel_time_per_slot[0]);
+ }
+ } else { // Double buffering
+ // Process input for first 2 kernel launches and queue them. Then block
+ // on processing the output of the first kernel.
+ ProcessInput(input_buf[0]);
+ ProcessInput(input_buf[1]);
+
+ std::cout << "Launching kernel #0\n";
+
+ SimplePow(q, input_buf[0], output_buf[0], sycl_events[0]);
+ for (int i = 1; i < kTimes; i++) {
+ if (i % 10 == 0) {
+ std::cout << "Launching kernel #" << i << "\n";
+ } // Only print every few iterations, just to limit the prints.
+
+ // Launch the next kernel
+ SimplePow(q, input_buf[i % 2], output_buf[i % 2], sycl_events[i % 2]);
+
+ // Process output from previous kernel. This will block on kernel
+ // completion.
+ ProcessOutput(input_buf[(i - 1) % 2], output_buf[(i - 1) % 2], i,
+ sycl_events[(i - 1) % 2],
+ total_kernel_time_per_slot[(i - 1) % 2]);
+
+ // Generate input for the next kernel.
+ ProcessInput(input_buf[(i - 1) % 2]);
+ }
+
+ // Process output of the final kernel
+ ProcessOutput(input_buf[(kTimes - 1) % 2], output_buf[(kTimes - 1) % 2],
+ i, sycl_events[(kTimes - 1) % 2],
+ total_kernel_time_per_slot[(kTimes - 1) % 2]);
+ }
+
+ // Add up the overall kernel execution time.
+ total_kernel_time = 0;
+ for (int i = 0; i < 2; i++) {
+ total_kernel_time += total_kernel_time_per_slot[i];
+ }
+
+ // Stop the timer.
+ double time_span = exec_time.Elapsed();
+
+ std::cout << "\nOverall execution time "
+ << ((i == 0) ? "without" : "with") << " double buffering = "
+ << (unsigned)(time_span * 1000) << " ms\n";
+ std::cout << "Total kernel-only execution time "
+ << ((i == 0) ? "without" : "with") << " double buffering = "
+ << (unsigned)(total_kernel_time / 1000000) << " ms\n";
+ std::cout << "Throughput = " << std::setprecision(8)
+ << (float)kSize * (float)kTimes * (float)sizeof(float) /
+ (float)time_span / 1000000
+ << " MB/s\n\n\n";
+ }
+ if (pass) {
+ std::cout << "Verification PASSED\n";
+ } else {
+ std::cout << "Verification FAILED\n";
+ return 1;
+ }
+ } catch (sycl::exception const& e) {
+ // Catches exceptions in the host code
+ std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) {
+ std::cout << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cout << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt
new file mode 100755
index 0000000000..134e6d8534
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+
+cmake_minimum_required (VERSION 2.8)
+
+project(NWayBuffering)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md
new file mode 100755
index 0000000000..d4fb12ba40
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/README.md
@@ -0,0 +1,297 @@
+
+# N-Way Buffering to Overlap Kernel Execution with Buffer Transfers and Host Processing
+
+This FPGA tutorial demonstrates how to parallelize host-side processing and buffer transfers between host and device with kernel execution to improve overall application performance. It is a generalization of the 'double buffering' technique, and can be used to perform this overlap even when the host-processing time exceeds kernel execution time.
+
+***Documentation***: The [FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a resource for general target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA;
Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta)
Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How and when to apply the N-way buffering optimization technique
+| Time to complete | 30 minutes
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+## Purpose
+N-Way buffering is a generalization of the double buffering optimization technique (see the "Double Buffering" FPGA tutorial). This system-level optimization enables kernel execution to occur in parallel with host-side processing and buffer transfers between host and device, improving application performance. N-way buffering can achieve this overlap even when the host-processing time exceeds kernel execution time.
+
+### Background
+
+In an application where the FPGA kernel is executed multiple times, the host must perform the following processing and buffer transfers before each kernel invocation:
+1. The output data from the *previous* invocation must be transferred from the device to host and then processed by the host. Examples of this processing include the following:
+ * Copying the data to another location
+ * Rearranging the data
+ * Verifying it in some way
+2. The input data for the *next* invocation must be processed by the host and then transferred to the device. Examples of this processing include:
+ * Copying the data from another location
+ * Rearranging the data for kernel consumption
+ * Generating the data in some way
+
+Without the technique described in this tutorial, host processing and buffer transfers occur *between* kernel executions. Therefore, there is a gap in time between kernel executions, which you can refer to as kernel "downtime" (see diagram below). If these operations overlap with kernel execution, the kernels can execute back-to-back with minimal downtime, thereby increasing overall application performance.
+
+### N-Way Buffering
+
+This technique is referred to as *N-Way Buffering*, but is frequently called *double buffering* in the most common case where N=2.
+
+Let's first define some variables:
+
+| Variable | Description |
+| ------ | ------ |
+| **R** | Time to transfer the kernel's output buffer from device to host. |
+| **Op** | Host-side processing time of kernel output data (*output processing*). |
+| **Ip** | Host-side processing time for kernel input data (*input processing*). |
+| **W** | Time to transfer the kernel's input buffer from host to device. |
+| **K** | Kernel execution time. |
+| **N** | Number of buffer sets used. |
+| **C** | Number of host-side CPU cores. |
+
+
+
+
+
+In general, the **R**, **Op**, **Ip**, and **W** operations must all complete before the next kernel is launched. To maximize performance, while one kernel is executing on the device, these operations should run in parallel and operate on a separate set of buffer locations. They should complete before the current kernel completes, thus allowing the next kernel to be launched immediately with no downtime. In general, to maximize performance, the host must launch a new kernel every **K**.
+
+If these host-side operations are executed serially, this leads to the following constraint:
+
+```c++
+R + Op + Ip + W <= K, to minimize kernel downtime.
+```
+
+In the above example, if the constraint is satisfied, the application requires two sets of buffers. In this case, **N**=2.
+
+However, the above constraint may not be satisfied in some applications (for example, if host processing takes longer than the kernel execution time).
+
+**NOTE**: A performance improvement may still be observed because kernel downtime may still be reduced (though perhaps not maximally reduced).
+
+In this case, to further improve performance, reduce the host-processing time through multi-threading. Rather than executing the above operations serially, perform the input- and output-processing operations in parallel using two threads, leading to the following constraint:
+
+```c++
+Max (R+Op, Ip+W) <= K
+and
+R + W <= K, to minimize kernel downtime.
+```
+
+If the above constraint is still unsatisfied, the technique can be extended beyond two sets of buffers to **N** sets of buffers to help improve the degree of overlap. In this case, the constraint becomes:
+
+```c++
+Max (R + Op, Ip + W) <= (N-1)*K
+and
+R + W <= K, to minimize kernel downtime.
+```
+
+The idea of N-way buffering is to prepare **N** sets of kernel input buffers, launch **N** kernels, and when the first kernel completes, begin the subsequent host-side operations. These operations may take a long time (longer than **K**), but they do not cause kernel downtime because an additional **N**-1 kernels have already been queued and can launch immediately. By the time these first **N** kernels complete, the aforementioned host-side operations would have also completed and the **N**+1 kernel can be launched with no downtime. As additional kernels complete, corresponding host-side operations are launched on the host, in a parallel fashion, using multiple threads. Although the host operations take longer than **K**, if **N** is chosen correctly, they will complete with a period of **K**, which is required to ensure we can launch a new kernel every **K**. To reiterate, this scheme requires multi-threaded host-operations because the host must perform processing for up to **N** kernels in parallel in order to keep up.
+
+The above formula can be used to calculate the **N** required to minimize downtime. However, there are some practical limits:
+* **N** sets of buffers are required on both the host and device, therefore both must have the capacity for this many buffers.
+* If the input- and output-processing operations are launched in separate threads, then (**N**-1)*2 cores are required, so **C** can become the limiting factor.
+
+### Measuring the Impact of N-Way Buffering
+
+To identify the degree to which this technique can improve performance, you must first get a sense of the kernel downtime.
+
+This can be done by querying the total kernel execution time from the runtime and comparing it with the overall application execution time. In an application where kernels execute with minimal downtime, these two numbers are close. However, if the kernels have significant downtime, the overall execution time notably exceeds the kernel execution time. The tutorial code demonstrates how to do this.
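+
+The following is a minimal sketch of such a measurement (illustrative only, not the tutorial's exact code). It assumes the queue was created with the `property::queue::enable_profiling()` property so that kernel events carry timing information:
+
+```c++
+#include <CL/sycl.hpp>
+#include <chrono>
+#include <cstdint>
+#include <vector>
+using namespace sycl;
+
+// Sum the device-side execution time of a set of kernel events, in ms.
+double TotalKernelMs(const std::vector<event> &kernel_events) {
+  uint64_t ns = 0;
+  for (const auto &e : kernel_events) {
+    ns += e.get_profiling_info<info::event_profiling::command_end>() -
+          e.get_profiling_info<info::event_profiling::command_start>();
+  }
+  return ns / 1.0e6;  // nanoseconds -> milliseconds
+}
+
+// Usage (sketch): bracket the whole launch/process loop with a wall-clock
+// timer and compare the two numbers. A large gap indicates kernel downtime.
+//   auto t0 = std::chrono::steady_clock::now();
+//   ... launch kernels, collect their events, process buffers ...
+//   double overall_ms = std::chrono::duration<double, std::milli>(
+//       std::chrono::steady_clock::now() - t0).count();
+//   double downtime_ms = overall_ms - TotalKernelMs(events);
+```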
+
+### Tutorial Implementation Notes
+
+The example code runs with multiple iterations to illustrate how performance improves as **N** increases and as multi-threading is used.
+
+It is useful to think of the execution space as having **N** slots where the slots execute in chronological order, and each slot has its own set of buffers on the host and device. At the beginning of execution, the host prepares the kernel input data for the **N** slots and launches **N** kernels. When slot-0 completes, slot-1 begins executing immediately because it was already queued. The host begins both the output and input processing for slot-0. These two operations must complete before the host can queue another kernel into slot-0. The same is true for all slots.
+
+After each kernel is launched, the host-side operations (that occur *after* the kernel in that slot completes) are launched immediately from the `main()` program. They block until the kernel execution for that slot completes (this is enforced by the runtime).
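+
+The scheduling pattern can be reduced to the following standalone toy (no SYCL; the "kernel" is simulated with a sleep, and the constants, helpers, and names are illustrative, not the tutorial's code). It shows how the slots, threads, and join points relate:
+
+```c++
+// Standalone illustration of the N-slot scheduling pattern.
+#include <chrono>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+constexpr int kN = 3;        // number of buffer sets (slots)
+constexpr int kLaunches = 9; // total number of "kernel" launches
+
+void ProcessInput(int)  { std::this_thread::sleep_for(std::chrono::milliseconds(30)); }
+void ProcessOutput(int) { std::this_thread::sleep_for(std::chrono::milliseconds(30)); }
+
+int main() {
+  std::vector<std::thread> t_in(kN), t_out(kN);
+
+  // Prepare input for the first kN launches, in parallel.
+  for (int slot = 0; slot < kN; slot++) t_in[slot] = std::thread(ProcessInput, slot);
+
+  for (int i = 0; i < kLaunches; i++) {
+    int slot = i % kN;
+    if (i >= kN) t_out[slot].join();  // previous output for this slot processed?
+    t_in[slot].join();                // new input for this slot ready?
+
+    std::cout << "Launching kernel #" << i << " in slot " << slot << "\n";
+    // A real design would submit a SYCL kernel here and keep its event.
+
+    t_out[slot] = std::thread(ProcessOutput, slot);  // post-process this launch
+    if (i < kLaunches - kN)                          // prepare this slot's next input
+      t_in[slot] = std::thread(ProcessInput, slot);
+  }
+  for (int slot = 0; slot < kN; slot++) t_out[slot].join();
+  return 0;
+}
+```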
+
+
+## Key Concepts
+* The N-way buffering optimization technique as a generalization of double buffering
+* Determining when N-way buffering is practical and beneficial
+* How to measure the impact of N-way buffering
+
+## License
+This code sample is licensed under the MIT license.
+
+
+## Building the `n_way_buffering` Tutorial
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+## Examining the Reports
+Locate `report.html` in the `n_way_buffering_report.prj/reports/` or `n_way_buffering_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*.
+
+Note that because the optimization described in this tutorial takes place at the *runtime* level, the FPGA compiler report will not show a difference between the optimized and unoptimized cases.
+
+
+## Running the Sample
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./n_way_buffering.fpga_emu (Linux)
+ n_way_buffering.fpga_emu.exe (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+ ./n_way_buffering.fpga (Linux)
+ ```
+
+### Example of Output
+
+```
+Platform name: Intel(R) FPGA SDK for OpenCL(TM)
+Device name: pac_a10 : Intel PAC Platform (pac_ec00000)
+
+
+Executing kernel 100 times in each round.
+
+*** Beginning execution, 1-way buffering, single-threaded host operations
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time = 65915 ms
+Total kernel-only execution time = 17852 ms
+Throughput = 15.907802 MB/s
+
+
+*** Beginning execution, 1-way buffering, multi-threaded host operations.
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time = 51814 ms
+Total kernel-only execution time = 17852 ms
+Throughput = 20.237082 MB/s
+
+
+*** Beginning execution, 2-way buffering, multi-threaded host operations
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time = 26109 ms
+Total kernel-only execution time = 17852 ms
+Throughput = 40.160442 MB/s
+
+
+*** Beginning execution, N=5-way buffering, multi-threaded host operations
+Launching kernel #0
+Launching kernel #10
+Launching kernel #20
+Launching kernel #30
+Launching kernel #40
+Launching kernel #50
+Launching kernel #60
+Launching kernel #70
+Launching kernel #80
+Launching kernel #90
+
+Overall execution time with N-way buffering = 18763 ms
+Total kernel-only execution time with N-way buffering = 17851 ms
+Throughput = 55.884682 MB/s
+
+
+Verification PASSED
+```
+
+### Discussion of Results
+
+A test compile of this tutorial design achieved an fMAX of approximately 340 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results are shown in the following table:
+
+Configuration | Overall Execution Time (ms) | Total Kernel Execution time (ms)
+-|-|-
+1-way buffering, single-threaded | 64401 | 15187
+1-way buffering, multi-threaded | 53540 | 15187
+2-way buffering, multi-threaded | 27281 | 15187
+5-way buffering, multi-threaded | 16284 | 15188
+
+In all runs, the total kernel execution time is similar, as expected. In the first three configurations, the overall execution time notably exceeds the total kernel execution time, implying there is downtime between kernel executions. However, as we switch from single-threaded to multi-threaded host operations and increase the number of buffer sets used, the overall execution time approaches the kernel execution time.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png
new file mode 100755
index 0000000000..2a306929bc
Binary files /dev/null and b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/downtime.png differ
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln
new file mode 100755
index 0000000000..5a77b3049a
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "n_way_buffering", "n_way_buffering.vcxproj", "{49E7063B-56DA-4ACF-B153-5B56A98645BE}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.ActiveCfg = Debug|x64
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Debug|x64.Build.0 = Debug|x64
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.ActiveCfg = Release|x64
+ {49E7063B-56DA-4ACF-B153-5B56A98645BE}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {CC320E26-0D79-434A-8E69-3F09BFB2FCF4}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj
new file mode 100755
index 0000000000..dff6f99529
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/n_way_buffering.vcxproj
@@ -0,0 +1,160 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+ 15.0
+ {49e7063b-56da-4acf-b153-5b56a98645be}
+ Win32Proj
+ n_way_buffering
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)n_way_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)n_way_buffering.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json
new file mode 100755
index 0000000000..dffbded768
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "2100C9BD-331C-475B-9878-4D14AAF0981D",
+ "name": "Overlapping Kernel Execution with Buffer Transfers and Host-Processing through N-Way Buffering",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"],
+ "description": "FPGA tutorial design to demonstrate overlapping kernel execution with buffer transfers and multi-threaded host-processing to improve system performance",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "targetDevice": ["FPGA"],
+ "builder": ["ide", "cmake"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./n_way_buffering.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "n_way_buffering.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt
new file mode 100755
index 0000000000..cf12b30f72
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/CMakeLists.txt
@@ -0,0 +1,93 @@
+set(SOURCE_FILE n_way_buffering.cpp)
+set(TARGET_NAME n_way_buffering)
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS " -lpthread -fintelfpga")
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+  set(FPGA_OBJ_FILE "dev_fpga.o")
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+
+ add_custom_command(OUTPUT ${FPGA_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} -fintelfpga -c ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${FPGA_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${FPGA_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} ${FPGA_OBJ_FILE} -o ${CMAKE_BINARY_DIR}/${FPGA_TARGET} -lpthread
+ DEPENDS ${FPGA_OBJ_FILE})
+endif()
+
+
+# report
+if(WIN32)
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja
new file mode 100755
index 0000000000..80284aff9b
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/build.ninja
@@ -0,0 +1,30 @@
+source_file = n_way_buffering.cpp
+target_name = n_way_buffering
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp
new file mode 100755
index 0000000000..c5428348db
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/n_way_buffering/src/n_way_buffering.cpp
@@ -0,0 +1,437 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <cassert>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <thread>
+#include <vector>
+#include "dpc_common.hpp"
+
+using namespace sycl;
+
+// N-way buffering. N must be >= 1.
+constexpr int kLocalN = 5;
+
+// # times to execute the kernel. kTimes must be >= kLocalN
+#if defined(FPGA_EMULATOR)
+constexpr int kTimes = 20;
+#else
+constexpr int kTimes = 100;
+#endif
+
+// # of floats to process on each kernel execution.
+#if defined(FPGA_EMULATOR)
+constexpr int kSize = 4096;
+#else
+constexpr int kSize = 2621440; // ~10MB
+#endif
+
+// Kernel executes a power function (base^kPow). Must be
+// >= 2. Can increase this to increase kernel execution
+// time, but ProcessOutput() time will also increase.
+constexpr int kPow = 20;
+
+// Number of iterations through the main loop
+constexpr int kNumRuns = 4;
+
+bool pass = true;
+
+class SimpleVpow;
+
+/* Kernel function.
+ Performs buffer_b[i] = buffer_a[i] ** pow
+ Only supports pow >= 2.
+ This kernel is not meant to be an optimal implementation of the power
+ operation -- it's just a sample kernel for this tutorial whose execution time
+ is easily controlled via the pow parameter. SYCL buffers are created
+ externally and passed in by reference to control (external to this function)
+ when the buffers are destructed. The destructor causes a blocking buffer
+ transfer from device to host and N-way buffering requires us to not block
+ here (because we need to queue more kernels). So we only want this transfer
+ to occur at the end of overall execution, not at the end of each individual
+ kernel execution.
+*/
+void SimplePow(std::unique_ptr<queue> &q, buffer<float, 1> &buffer_a,
+               buffer<float, 1> &buffer_b, event &e) {
+  // Submit to the queue and execute the kernel
+  e = q->submit([&](handler &h) {
+    // Get kernel access to the buffers
+    auto accessor_a = buffer_a.get_access<access::mode::read>(h);
+    auto accessor_b = buffer_b.get_access<access::mode::discard_read_write>(h);
+
+ const int num = kSize;
+ const int p = kPow - 1; // Assumes pow >= 2;
+ assert(kPow >= 2);
+
+    h.single_task<SimpleVpow>([=]() [[intel::kernel_args_restrict]] {
+ for (int j = 0; j < p; j++) {
+ if (j == 0) {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_a[i] * accessor_a[i];
+ }
+ } else {
+ for (int i = 0; i < num; i++) {
+ accessor_b[i] = accessor_b[i] * accessor_a[i];
+ }
+ }
+ }
+ });
+ });
+
+ event update_host_event;
+ update_host_event = q->submit([&](handler &h) {
+    auto accessor_b = buffer_b.get_access<access::mode::read>(h);
+
+ /*
+ Explicitly instruct the SYCL runtime to copy the kernel's output buffer
+ back to the host upon kernel completion. This is not required for
+ functionality since the buffer access in ProcessOutput() also implicitly
+ instructs the runtime to copy the data back. But it should be noted that
+ this buffer access blocks ProcessOutput() until the kernel is complete
+ and the data is copied. In contrast, update_host() instructs the runtime
+ to perform the copy earlier. This allows ProcessOutput() to optionally
+ perform more useful work *before* making the blocking buffer access. Said
+ another way, this allows ProcessOutput() to potentially perform more work
+ in parallel with the runtime's copy operation.
+ */
+ h.update_host(accessor_b);
+ });
+
+}
+
+// Returns kernel execution time for a given SYCL event from a queue.
+ulong SyclGetExecTimeNs(event e) {
+  ulong start_time =
+      e.get_profiling_info<info::event_profiling::command_start>();
+  ulong end_time =
+      e.get_profiling_info<info::event_profiling::command_end>();
+ return (end_time - start_time);
+}
+
+// Local pow function for verifying results
+float MyPow(float input, int pow) {
+ return (pow == 0) ? 1 : input * MyPow(input, pow - 1);
+}
+
+/* Compares kernel output against expected output.
+ Grabs kernel output data from its SYCL buffer. Reading from this buffer is a
+ blocking operation that will block on the kernel completing. Grabs expected
+ output from a host-side copy of the input data. A copy is used to allow for
+ parallel generation of the input data for the next execution. Queries and
+ records execution time of the kernel that just completed. This is a natural
+ place to do this because ProcessOutput() is blocked on kernel completion.
+*/
+void ProcessOutput(buffer<float, 1> &output_buf,
+                   std::vector<float> &input_copy, int exec_number, event e,
+                   ulong &total_kernel_time_per_slot) {
+  auto output_buf_acc = output_buf.get_access<access::mode::read>();
+ int num_errors = 0;
+ int num_errors_to_print = 10;
+
+ /* The use of update_host() in the kernel function allows for additional
+ host-side operations to be performed here, in parallel with the buffer copy
+ operation from device to host, before the blocking access to the output
+ buffer is made via output_buf_acc[]. To be clear, no real operations are
+ done here and this is just a note that this is the place
+ where you *could* do it. */
+ for (int i = 0; i < kSize; i++) {
+    bool out_mismatch = (MyPow(input_copy.data()[i], kPow) != output_buf_acc[i]);
+    if ((num_errors < num_errors_to_print) && out_mismatch) {
+ if (num_errors == 0) {
+ pass = false;
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ". Showing up to " << num_errors_to_print
+ << " mismatches.\n";
+ }
+ std::cout << "Verification failed on kernel execution # " << exec_number
+ << ", at element " << i << ". Expected " << std::fixed
+ << std::setprecision(16) << MyPow(input_copy.data()[i], kPow)
+ << " but got " << output_buf_acc[i] << "\n";
+ num_errors++;
+ }
+ }
+
+ // At this point we know the kernel has completed, so can query the profiling
+ // data.
+ total_kernel_time_per_slot += SyclGetExecTimeNs(e);
+}
+
+/*
+ Generates input data for the next kernel execution.
+ Writes the data into the associated SYCL buffer. The write will block until
+ the previous kernel execution, that is using this buffer, completes. Writes a
+ copy of the data into a host-side buffer that will later be used by
+ ProcessOutput().
+*/
+void ProcessInput(buffer<float, 1> &buf, std::vector<float> &copy) {
+ // We are generating completely new input data, so can use discard_write()
+ // here to indicate we don't care about the SYCL buffer's current contents.
+  auto buf_acc = buf.get_access<access::mode::discard_write>();
+
+ // RNG seed
+ auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+
+ // RNG engine
+ std::default_random_engine dre(seed);
+
+ // Values between 1 and 2
+  std::uniform_real_distribution<float> di(1.0f, 2.0f);
+
+ // Randomly generate a start value and increment from there.
+ // Compared to randomly generating every value, this is done to
+ // speed up this function a bit.
+ float start_val = di(dre);
+
+ for (int i = 0; i < kSize; i++) {
+ buf_acc[i] = start_val;
+ copy.data()[i] = start_val;
+ start_val++;
+ }
+}
+
+int main() {
+// Create queue, get platform and device
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+ std::cout << "\nEmulator output does not demonstrate true hardware "
+ "performance. The design may need to run on actual hardware "
+ "to observe the performance benefit of the optimization "
+ "exemplified in this tutorial.\n\n";
+#else
+ intel::fpga_selector device_selector;
+#endif
+
+ try {
+ auto prop_list =
+ property_list{property::queue::enable_profiling()};
+
+    std::unique_ptr<queue> q;
+ q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list));
+
+ platform platform = q->get_context().get_platform();
+ device device = q->get_device();
+ std::cout << "Platform name: "
+ << platform.get_info().c_str() << "\n";
+ std::cout << "Device name: "
+ << device.get_info().c_str() << "\n\n\n";
+
+ std::cout << "Executing kernel " << kTimes << " times in each round.\n\n";
+
+ // Create a vector to store the input/output SYCL buffers
+    std::vector<buffer<float, 1>> input_buf;
+    std::vector<buffer<float, 1>> output_buf;
+
+ // For every execution slot, we need 2 host-side buffers
+ // to store copies of the input data. One is used to
+ // verify the previous kernel's output. The other stores
+ // the new data for the next kernel execution.
+    std::vector<float> input_buf_copy[2 * kLocalN];
+
+ // SYCL events for each kernel launch.
+ event sycl_events[kLocalN];
+
+ // In nanoseconds. Total execution time of kernels in a given slot.
+ ulong total_kernel_time_per_slot[kLocalN];
+
+ // Total execution time of all kernels.
+ ulong total_kernel_time = 0;
+
+ // Threads to process the output from each kernel
+ std::thread t_process_output[kLocalN];
+
+ // Threads to process the input data for the next kernel
+ std::thread t_process_input[kLocalN];
+
+ // Demonstrate with 1-way buffering first, then N-way buffering.
+ int N;
+
+ // st = "single threaded".
+ // Used to enable multi-threading in subsequent runs.
+ bool st = true;
+
+ // Allocate vectors to store the host-side copies of the input data
+ for (int i = 0; i < 2 * kLocalN; i++) {
+      input_buf_copy[i] = std::vector<float>(kSize);
+ }
+
+ // Create and allocate the SYCL buffers
+ for (int i = 0; i < kLocalN; i++) {
+      input_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+      output_buf.push_back(buffer<float, 1>(range<1>(kSize)));
+ }
+
+ /*
+ Main loop.
+ This loop runs multiple times to demonstrate how performance can be
+ improved by increasing the number of buffers as well as multi-threading
+ the host-side operations. The first iteration is a base run, demonstrating
+ the performance with none of these optimizations (ie. 1-way buffering,
+ single-threaded).
+ */
+ for (int i = 0; i < kNumRuns; i++) {
+      for (int slot = 0; slot < kLocalN; slot++) {
+        total_kernel_time_per_slot[slot] = 0;  // Initialize timers to zero.
+      }
+
+ switch (i) {
+ case 0: {
+ std::cout << "*** Beginning execution, 1-way buffering, "
+ "single-threaded host operations\n";
+ N = 1;
+ st = true;
+ break;
+ }
+ case 1: {
+ std::cout << "*** Beginning execution, 1-way buffering, "
+ "multi-threaded host operations.\n";
+ N = 1;
+ st = false;
+ break;
+ }
+ case 2: {
+ std::cout << "*** Beginning execution, 2-way buffering, "
+                       "multi-threaded host operations\n";
+ N = 2;
+ st = false;
+ break;
+ }
+ case 3: {
+ std::cout << "*** Beginning execution, N=" << kLocalN
+ << "-way buffering, multi-threaded host operations\n";
+ N = kLocalN;
+ st = false;
+ break;
+ }
+ default:
+ std::cout << "*** Beginning execution.\n";
+ }
+
+ // Start the timer. This will include the time to process the
+ // input data for the first N kernel executions.
+ dpc_common::TimeInterval exec_time;
+
+ // Process the input data for first N kernel executions. For
+ // multi-threaded runs, this is done in parallel.
+ for (int i = 0; i < N; i++) {
+ t_process_input[i] = std::thread(ProcessInput, std::ref(input_buf[i]),
+ std::ref(input_buf_copy[i]));
+ if (st) {
+ t_process_input[i].join();
+ }
+ }
+
+ /*
+ It's useful to think of the kernel execution space as having N slots.
+ Conceptually, the slots are executed chronologically sequentially on the
+ device (i.e. slot 0 to N-1). Each slot has its own buffering on both the
+ host and device. Before launching a kernel in a given slot, we must
+ process output data from the previous execution that occurred in that
+ slot and process new input data for the upcoming new execution in that
+ slot.
+ */
+ for (int i = 0; i < kTimes; i++) {
+ // The current slot is i%N.
+ // Before each kernel launch, the ProcessOutput() must have completed
+ // for the last execution in this slot. The ProcessInput() must also
+ // have completed for the upcoming new execution for this slot. Block on
+ // both of these.
+ if (!st) {
+ // ProcessOutput() is only relevant after the
+ // first N kernels have been launched.
+ if (i >= N) {
+ t_process_output[i % N].join();
+ }
+
+ t_process_input[i % N].join();
+ }
+
+ // Launch the kernel. This is non-blocking with respect to main().
+ // Only print every few iterations, just to limit the prints.
+ if (i % 10 == 0) {
+ std::cout << "Launching kernel #" << i << "\n";
+ }
+
+ SimplePow(q, input_buf[i % N], output_buf[i % N], sycl_events[i % N]);
+
+ // Immediately launch threads for the ProcessOutput() and
+ // ProcessInput() for *this* slot. These are non-blocking with respect
+ // to main(), but they will individually be blocked until the
+ // corresponding kernel execution is complete. The ProcessOutput()
+ // compares the kernel output data against the input data. But
+ // ProcessInput() will be overwriting that input data in parallel.
+ // Therefore ProcessOutput() must compare against an older copy of the
+ // data. We ping-pong between host-side copies of the input data.
+ t_process_output[i % N] = std::thread(
+ ProcessOutput, std::ref(output_buf[i % N]),
+ std::ref(input_buf_copy[i % (2 * N)]), i, sycl_events[i % N],
+ std::ref(total_kernel_time_per_slot[i % N]));
+
+ // For single-threaded runs, force single-threaded operation by
+ // blocking here immediately.
+ if (st) {
+ t_process_output[i % N].join();
+ }
+
+ // For the final N kernel launches, no need to process
+ // input data because there will be no more launches.
+ if (i < kTimes - N) {
+ // The indexes for the input_buf_copy used by ProcessOutput() and
+ // ProcessInput() are spaced N apart.
+ t_process_input[i % N] =
+ std::thread(ProcessInput, std::ref(input_buf[i % N]),
+ std::ref(input_buf_copy[(i + N) % (2 * N)]));
+
+ if (st) {
+ t_process_input[i % N].join();
+ }
+ }
+ }
+
+ // Wait for the final N threads to finish and add up the overall kernel
+ // execution time.
+ total_kernel_time = 0;
+ for (int i = 0; i < N; i++) {
+ if (!st) {
+ t_process_output[i].join();
+ }
+ total_kernel_time += total_kernel_time_per_slot[i];
+ }
+
+ // Stop the timer.
+ double time_span = exec_time.Elapsed();
+
+ std::cout << "\nOverall execution time "
+ << ((i == kNumRuns - 1) ? ("with N-way buffering ") : "")
+ << "= " << (unsigned)(time_span * 1000) << " ms\n";
+ std::cout << "Total kernel-only execution time "
+ << ((i == kNumRuns - 1) ? ("with N-way buffering ") : "")
+ << "= " << (unsigned)(total_kernel_time / 1000000) << " ms\n";
+ std::cout << "Throughput = " << std::setprecision(8)
+ << (float)kSize * (float)kTimes * (float)sizeof(float) /
+ (float)time_span / 1000000
+ << " MB/s\n\n\n";
+ }
+ if (pass) {
+ std::cout << "Verification PASSED\n";
+ } else {
+ std::cout << "Verification FAILED\n";
+ return 1;
+ }
+ } catch (sycl::exception const& e) {
+ // Catches exceptions in the host code
+ std::cout << "Caught a SYCL host exception:\n" << e.what() << "\n";
+
+ // Most likely the runtime couldn't find FPGA hardware!
+ if (e.get_cl_code() == CL_DEVICE_NOT_FOUND) {
+ std::cout << "If you are targeting an FPGA, please ensure that your "
+ "system has a correctly configured FPGA board.\n";
+ std::cout << "If you are targeting the FPGA emulator, compile with "
+ "-DFPGA_EMULATOR.\n";
+ }
+ std::terminate();
+ }
+ return 0;
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt
new file mode 100755
index 0000000000..4835f73b5f
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(CMAKE_CXX_COMPILER "dpcpp")
+
+cmake_minimum_required (VERSION 2.8)
+
+project(LocalMemoryCache)
+
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+add_subdirectory (src)
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt
new file mode 100755
index 0000000000..e63c6e13dc
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/License.txt
@@ -0,0 +1,7 @@
+Copyright Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md
new file mode 100755
index 0000000000..8a974787e4
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/README.md
@@ -0,0 +1,189 @@
+# Caching On-Chip Memory to Improve Loop Performance
+This FPGA tutorial demonstrates how to build a simple cache (implemented in FPGA registers) to store recently-accessed memory locations so that the compiler can achieve II=1 on critical loops in task kernels.
+
+
+***Documentation***: The [oneAPI DPC++ FPGA Optimization Guide](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-fpga-optimization-guide) provides comprehensive instructions for targeting FPGAs through DPC++. The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programming-guide) is a general resource for target-independent DPC++ programming.
+
+| Optimized for | Description
+--- |---
+| OS | Linux* Ubuntu* 18.04; Windows* 10
+| Hardware | Intel® Programmable Acceleration Card (PAC) with Intel Arria® 10 GX FPGA; Intel® Programmable Acceleration Card (PAC) with Intel Stratix® 10 SX FPGA
+| Software | Intel® oneAPI DPC++ Compiler (Beta); Intel® FPGA Add-On for oneAPI Base Toolkit
+| What you will learn | How and when to implement the on-chip memory cache optimization
+| Time to complete | 30 minutes
+
+_Notice: Limited support in Windows*; compiling for FPGA hardware is not supported in Windows*_
+
+## Purpose
+In DPC++ task kernels for FPGA, it is always our objective to achieve an initiation interval (II) of 1 on performance-critical loops. This means that a new loop iteration is launched on every clock cycle, maximizing the throughput of the loop.
+
+When the loop contains a loop-carried variable that is implemented in on-chip memory, the compiler often *cannot* achieve II=1 because the memory access takes more than one clock cycle. If the updated memory location may be needed on the next loop iteration, the next iteration must be delayed to allow time for the update, hence II > 1.
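+
+As a minimal illustration, the problematic pattern is just a read-modify-write of a small array inside the loop. This is a plain C++ rendering; the function and variable names are placeholders, not the tutorial's code:
+
+```c++
+#include <cstdint>
+#include <vector>
+
+// Each iteration reads, increments, and writes back one location of a small
+// array. On an FPGA, with the array in on-chip memory, the write takes more
+// than one cycle, so an iteration that may hit the same bucket must wait for
+// the previous update, forcing II > 1.
+void NaiveHistogram(const std::vector<uint32_t> &input, uint32_t *counts,
+                    uint32_t num_buckets) {
+  for (size_t i = 0; i < input.size(); i++) {
+    uint32_t b = input[i] % num_buckets;  // output bucket
+    counts[b] = counts[b] + 1;            // loop-carried read-modify-write
+  }
+}
+```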
+
+The on-chip memory cache technique breaks this dependency by storing recently-accessed values in a cache capable of a 1-cycle read-modify-write operation. The cache is implemented in FPGA registers rather than on-chip memory. By pulling memory accesses preferentially from the register cache, the loop-carried dependency is broken.
+
+### When is the on-chip memory cache technique applicable?
+
+***Failure to achieve II=1 because of a loop-carried memory dependency in on-chip memory***:
+The on-chip memory cache technique is applicable if the compiler could not pipeline a loop with II=1 because of an on-chip memory dependency. (If the compiler could not achieve II=1 because of a *global* memory dependency, this technique does not apply, as the access latencies are too great.)
+
+To check this for a given design, view the "Loops Analysis" section of its optimization report. The report lists the II of all loops and explains why a lower II is not achievable. Check whether the reason given resembles "the compiler failed to schedule this loop with smaller II due to memory dependency". The report will describe the "most critical loop feedback path during scheduling". Check whether this includes on-chip memory load/store operations on the critical path.
+
+***An II=1 loop with a load operation of latency 1***:
+The compiler is capable of reducing the latency of on-chip memory accesses in order to achieve II=1. However, in doing so the compiler makes a trade-off, sacrificing fMAX to better optimize the loop.
+
+In a design with II=1 critical loops but lower than desired fMAX, the on-chip memory cache technique may still be applicable. It can help recover fMAX by enabling the compiler to achieve II=1 with a higher latency memory access.
+
+To check whether this is the case for a given design, view the "Kernel Memory Viewer" section of the optimization report. Select the on-chip memory of interest from the Kernel Memory List, and mouse over the load operation "LD" to check its latency. If the latency of the load operation is 1, this is a clear sign that the compiler has attempted to sacrifice fMAX to better optimize a loop.
+
+
+### Implementing the on-chip memory cache technique
+
+The tutorial demonstrates the technique using a program that computes a histogram. The histogram operation accepts an input vector of values, separates the values into buckets, and counts the number of values per bucket. For each input value, an output bucket location is determined, and the count for the bucket is incremented. This count is stored in the on-chip memory and the increment operation requires reading from the memory, performing the increment, and storing the result. This read-modify-write operation is the critical path that can result in II > 1.
+
+To reduce II, the idea is to store recently accessed values in a cache implemented in FPGA registers, which is capable of a 1-cycle read-modify-write operation. If the memory location required on a given iteration exists in the cache, its value is taken from there. The updated count is written back to *both* the cache and the on-chip memory. The `ivdep` attribute is added to inform the compiler that if the same histogram location is accessed again within `CACHE_DEPTH` iterations, its up-to-date value is guaranteed to be available immediately from the cache, so the on-chip memory dependency need not be enforced across those iterations.
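+
+A simplified sketch of the cached loop follows. The array and variable names are illustrative, `input` and `num_inputs` stand in for the kernel's input data, and the exact `ivdep` attribute spelling may differ by compiler version; see `src/onchip_memory_cache.cpp` for the real implementation:
+
+```c++
+// Kernel-body fragment (illustrative names; assumes kNumBuckets and
+// CACHE_DEPTH are compile-time constants).
+uint32_t histogram[kNumBuckets];        // on-chip memory
+uint32_t cache_value[CACHE_DEPTH + 1];  // register cache: recent counts
+uint32_t cache_index[CACHE_DEPTH + 1];  // register cache: recent buckets
+for (uint32_t b = 0; b < kNumBuckets; b++) histogram[b] = 0;
+for (int i = 0; i < CACHE_DEPTH + 1; i++) cache_index[i] = kNumBuckets;  // "empty"
+
+[[intel::ivdep(CACHE_DEPTH)]]  // assert: no dependency closer than CACHE_DEPTH iterations
+for (uint32_t n = 0; n < num_inputs; n++) {
+  uint32_t b = input[n] % kNumBuckets;  // output bucket
+  uint32_t count = histogram[b];        // on-chip memory read (multi-cycle)
+
+  // If this bucket was updated within the last CACHE_DEPTH iterations, take
+  // the newer value from the register cache (1-cycle). Iterate from oldest
+  // to newest so the newest matching entry wins.
+  #pragma unroll
+  for (int i = CACHE_DEPTH; i >= 0; i--) {
+    if (cache_index[i] == b) count = cache_value[i];
+  }
+  count++;
+
+  // Shift the cache and insert the updated count at the front (newest).
+  #pragma unroll
+  for (int i = CACHE_DEPTH; i > 0; i--) {
+    cache_value[i] = cache_value[i - 1];
+    cache_index[i] = cache_index[i - 1];
+  }
+  cache_value[0] = count;
+  cache_index[0] = b;
+
+  // Write back to both the cache (above) and the on-chip memory.
+  histogram[b] = count;
+}
+```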
+
+### Selecting the cache depth
+
+While any value of `CACHE_DEPTH` results in functional hardware, the ideal value of `CACHE_DEPTH` requires some experimentation. The depth of the cache needs to roughly cover the latency of the on-chip memory access. To determine the correct value, start with a value of 2 and increase it until the report shows both II = 1 and a load latency greater than 1. In this tutorial, a `CACHE_DEPTH` of 5 is needed.
+
+Each experiment only requires running `make report` (refer to the build instructions below), so iterating takes just a few minutes per value. It is important to find the *minimal* value of `CACHE_DEPTH` that yields the maximal performance increase, because unnecessarily large values of `CACHE_DEPTH` consume extra FPGA resources and can reduce fMAX. Therefore, once a `CACHE_DEPTH` yields II = 1 and further increases show no improvement in the load latency, `CACHE_DEPTH` should not be increased any further.
+
+In the tutorial, two versions of the histogram kernel are implemented: one with and one without caching. The report shows II > 1 for the loop in the kernel without caching and II = 1 for the one with caching.
+
+## Key Concepts
+* How to implement the on-chip memory cache optimization technique
+* The scenarios in which this technique benefits performance
+* How to tune the cache depth
+
+## License
+This code sample is licensed under the MIT license.
+
+
+## Building the `onchip_memory_cache` Tutorial
+
+### Include Files
+The included header `dpc_common.hpp` is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system.
+
+### Running Samples in DevCloud
+If running a sample in the Intel DevCloud, remember that you must specify the compute node (fpga_compile or fpga_runtime) as well as whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide ([https://devcloud.intel.com/oneapi/get-started/base-toolkit/](https://devcloud.intel.com/oneapi/get-started/base-toolkit/)).
+
+When compiling for FPGA hardware, it is recommended to increase the job timeout to 12h.
+
+### On a Linux* System
+
+1. Generate the `Makefile` by running `cmake`.
+ ```
+ mkdir build
+ cd build
+ ```
+ To compile for the Intel® PAC with Intel Arria® 10 GX FPGA, run `cmake` using the command:
+ ```
+ cmake ..
+ ```
+ Alternatively, to compile for the Intel® PAC with Intel Stratix® 10 SX FPGA, run `cmake` using the command:
+
+ ```
+ cmake .. -DFPGA_BOARD=intel_s10sx_pac:pac_s10
+ ```
+
+2. Compile the design through the generated `Makefile`. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ make fpga_emu
+ ```
+ * Generate the optimization report:
+ ```
+ make report
+ ```
+ * Compile for FPGA hardware (longer compile time, targets FPGA device):
+ ```
+ make fpga
+ ```
+3. (Optional) As the above hardware compile may take several hours to complete, an Intel® PAC with Intel Arria® 10 GX FPGA precompiled binary can be downloaded here.
+
+### On a Windows* System
+Note: `cmake` is not yet supported on Windows. A build.ninja file is provided instead.
+
+1. Enter the source file directory.
+ ```
+ cd src
+ ```
+
+2. Compile the design. The following build targets are provided, matching the recommended development flow:
+
+ * Compile for emulation (fast compile time, targets emulated FPGA device):
+ ```
+ ninja fpga_emu
+ ```
+
+ * Generate the optimization report:
+
+ ```
+ ninja report
+ ```
+ If you are targeting Intel® PAC with Intel Stratix® 10 SX FPGA, instead use:
+ ```
+ ninja report_s10_pac
+ ```
+ * Compiling for FPGA hardware is not yet supported on Windows.
+
+### In Third-Party Integrated Development Environments (IDEs)
+
+You can compile and run this tutorial in the Eclipse* IDE (in Linux*) and the Visual Studio* IDE (in Windows*). For instructions, refer to the following link: [Intel® oneAPI DPC++ FPGA Workflows on Third-Party IDEs](https://software.intel.com/en-us/articles/intel-oneapi-dpcpp-fpga-workflow-on-ide)
+
+
+## Examining the Reports
+Locate `report.html` in the `onchip_memory_cache_report.prj/reports/` or `onchip_memory_cache_s10_pac_report.prj/reports/` directory. Open the report in any of Chrome*, Firefox*, Edge*, or Internet Explorer*.
+
+Compare the "Loops Analysis" sections of the reports generated with and without the on-chip memory cache optimization, as described in the "When is the on-chip memory cache technique applicable?" section.
+
+
+## Running the Sample
+
+ 1. Run the sample on the FPGA emulator (the kernel executes on the CPU):
+ ```
+ ./onchip_memory_cache.fpga_emu (Linux)
+ onchip_memory_cache.fpga_emu.exe (Windows)
+ ```
+2. Run the sample on the FPGA device:
+ ```
+ ./onchip_memory_cache.fpga (Linux)
+ ```
+
+### Example of Output
+
+```
+Platform name: Intel(R) FPGA SDK for OpenCL(TM)
+Device name: pac_a10 : Intel PAC Platform (pac_ee00000)
+
+
+Number of inputs: 16777216
+Number of outputs: 64
+
+Beginning run without local memory caching.
+
+Verification PASSED
+
+Kernel execution time: 0.114106 seconds
+Kernel throughput without caching: 560.884047 MB/s
+
+Beginning run with local memory caching.
+
+Verification PASSED
+
+Kernel execution time: 0.059061 seconds
+Kernel throughput with caching: 1083.623184 MB/s
+```
+
+### Discussion of Results
+
+A test compile of this tutorial design achieved an fMAX of approximately 250 MHz on the Intel® Programmable Acceleration Card with Intel® Arria® 10 GX FPGA. The results are shown in the following table:
+
+Configuration | Execution Time (s) | Throughput (MB/s)
+-|-|-
+Without caching | 0.153 | 418
+With caching | 0.08 | 809
+
+When caching is used, performance notably increases. As previously mentioned, this technique should result in an II reduction, which should lead to a throughput improvement. The technique can also improve fMAX if the compiler had previously implemented a latency=1 load operation, in which case the fMAX increase should result in a further throughput improvement.
+
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln
new file mode 100755
index 0000000000..3df819f016
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.sln
@@ -0,0 +1,25 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.705
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "onchip_memory_cache", "onchip_memory_cache.vcxproj", "{66A01391-21D2-46BB-A37A-6B8670BEE1FC}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|x64 = Debug|x64
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Debug|x64.ActiveCfg = Debug|x64
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Debug|x64.Build.0 = Debug|x64
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Release|x64.ActiveCfg = Release|x64
+ {66A01391-21D2-46BB-A37A-6B8670BEE1FC}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(ExtensibilityGlobals) = postSolution
+ SolutionGuid = {E3206292-E99D-4ADC-B428-E0557E8070D4}
+ EndGlobalSection
+EndGlobal
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj
new file mode 100755
index 0000000000..940683894e
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/onchip_memory_cache.vcxproj
@@ -0,0 +1,160 @@
+
+
+
+
+ Debug
+ x64
+
+
+ Release
+ x64
+
+
+
+
+
+
+
+
+
+ 15.0
+ {66a01391-21d2-46bb-a37a-6b8670bee1fc}
+ Win32Proj
+ onchip_memory_cache
+ $(WindowsSDKVersion.Replace("\",""))
+
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+ Application
+ true
+ Intel(R) oneAPI DPC++ Compiler
+ Unicode
+
+
+ Application
+ false
+ Intel(R) oneAPI DPC++ Compiler
+ true
+ Unicode
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ true
+
+
+ true
+
+
+ false
+
+
+ false
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ Disabled
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)onchip_memory_cache.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+ Use
+ Level3
+ MaxSpeed
+ true
+ true
+ true
+ true
+ pch.h
+ true
+ -DFPGA_EMULATOR %(AdditionalOptions)
+ $(IntDir)onchip_memory_cache.obj
+ $(ONEAPI_ROOT)dev-utilities\latest\include
+
+
+ Console
+ true
+ true
+ true
+
+
+
+
+
+
\ No newline at end of file
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json
new file mode 100755
index 0000000000..a35ba679ac
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/sample.json
@@ -0,0 +1,51 @@
+{
+ "guid": "93DA332C-5490-4E4B-8038-BDEC1662A2D0",
+ "name": "Caching On-Chip Memory to Improve Loop Performance",
+ "categories": ["Toolkit/Intel® oneAPI Base Toolkit/FPGA/Tutorials"],
+ "description": "FPGA tutorial demonstrating the caching of on-chip memory to reduce loop initiation interval.",
+ "toolchain": ["dpcpp"],
+ "os": ["linux", "windows"],
+ "builder": ["ide", "cmake"],
+ "targetDevice": ["FPGA"],
+ "languages": [{"cpp":{}}],
+ "ciTests": {
+ "linux": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make fpga_emu",
+ "./onchip_memory_cache.fpga_emu"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "mkdir build",
+ "cd build",
+ "cmake ..",
+ "make report"
+ ]
+ }
+ ],
+ "windows": [
+ {
+ "id": "fpga_emu",
+ "steps": [
+ "cd src",
+ "ninja fpga_emu",
+ "onchip_memory_cache.fpga_emu.exe"
+ ]
+ },
+ {
+ "id": "report",
+ "steps": [
+ "cd src",
+ "ninja report"
+ ]
+ }
+ ]
+ }
+}
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt
new file mode 100755
index 0000000000..9ed3cee584
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/CMakeLists.txt
@@ -0,0 +1,89 @@
+set(SOURCE_FILE onchip_memory_cache.cpp)
+set(TARGET_NAME onchip_memory_cache)
+
+set(EMULATOR_TARGET ${TARGET_NAME}.fpga_emu)
+set(FPGA_TARGET ${TARGET_NAME}.fpga)
+
+# Intel supported FPGA Boards and their names
+set(A10_PAC_BOARD_NAME "intel_a10gx_pac:pac_a10")
+set(S10_PAC_BOARD_NAME "intel_s10sx_pac:pac_s10")
+
+# Assume target is the Intel(R) PAC with Intel Arria(R) 10 GX FPGA
+SET(_FPGA_BOARD ${A10_PAC_BOARD_NAME})
+
+# Check if target is the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA
+IF (NOT DEFINED FPGA_BOARD)
+ MESSAGE(STATUS "\tFPGA_BOARD was not specified. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for more information on how to run the design on the Intel(R) PAC with Intel Stratix(R) 10 SX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${A10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA.")
+
+ELSEIF(FPGA_BOARD STREQUAL ${S10_PAC_BOARD_NAME})
+ MESSAGE(STATUS "\tConfiguring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Stratix(R) 10 SX FPGA.")
+ SET(_FPGA_BOARD ${S10_PAC_BOARD_NAME})
+
+ELSE()
+ MESSAGE(STATUS "\tAn invalid board name was passed in using the FPGA_BOARD flag. Configuring the design to run on the Intel(R) Programmable Acceleration Card (PAC) with Intel Arria(R) 10 GX FPGA. Please refer to the README for the list of valid board names.")
+ENDIF()
+
+set(HARDWARE_COMPILE_FLAGS "-fintelfpga")
+
+# use cmake -D USER_HARDWARE_FLAGS= to set extra flags for FPGA backend compilation
+set(HARDWARE_LINK_FLAGS "-fintelfpga -Xshardware -Xsboard=${_FPGA_BOARD} ${USER_HARDWARE_FLAGS}")
+
+set(EMULATOR_COMPILE_FLAGS "-fintelfpga -DFPGA_EMULATOR")
+set(EMULATOR_LINK_FLAGS "-fintelfpga")
+
+# fpga emulator
+if(WIN32)
+ set(WIN_EMULATOR_TARGET ${EMULATOR_TARGET}.exe)
+ add_custom_target(fpga_emu DEPENDS ${WIN_EMULATOR_TARGET})
+ separate_arguments(WIN_EMULATOR_COMPILE_FLAGS WINDOWS_COMMAND "${EMULATOR_COMPILE_FLAGS}")
+ add_custom_command(OUTPUT ${WIN_EMULATOR_TARGET}
+ COMMAND ${CMAKE_CXX_COMPILER} ${WIN_EMULATOR_COMPILE_FLAGS} /GX ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${WIN_EMULATOR_TARGET}
+ DEPENDS ${SOURCE_FILE})
+else()
+ add_executable(${EMULATOR_TARGET} ${SOURCE_FILE})
+ add_custom_target(fpga_emu DEPENDS ${EMULATOR_TARGET})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES COMPILE_FLAGS ${EMULATOR_COMPILE_FLAGS})
+ set_target_properties(${EMULATOR_TARGET} PROPERTIES LINK_FLAGS ${EMULATOR_LINK_FLAGS})
+endif()
+
+
+# fpga
+if(WIN32)
+ add_custom_target(fpga
+ COMMAND echo "FPGA hardware flow is not supported in Windows")
+else()
+ add_executable(${FPGA_TARGET} EXCLUDE_FROM_ALL ${SOURCE_FILE})
+ add_custom_target(fpga DEPENDS ${FPGA_TARGET})
+ set_target_properties(${FPGA_TARGET} PROPERTIES COMPILE_FLAGS ${HARDWARE_COMPILE_FLAGS})
+ set_target_properties(${FPGA_TARGET} PROPERTIES LINK_FLAGS ${HARDWARE_LINK_FLAGS})
+endif()
+
+# generate report
+if(WIN32)
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST WINDOWS_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} /EHsc ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+
+else()
+ set(DEVICE_OBJ_FILE ${TARGET_NAME}_report.a)
+ add_custom_target(report DEPENDS ${DEVICE_OBJ_FILE})
+
+ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SOURCE_FILE} ${SOURCE_FILE} COPYONLY)
+
+ separate_arguments(HARDWARE_LINK_FLAGS_LIST UNIX_COMMAND "${HARDWARE_LINK_FLAGS}")
+ add_custom_command(OUTPUT ${DEVICE_OBJ_FILE}
+ COMMAND ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_FLAGS} ${HARDWARE_LINK_FLAGS_LIST} -fsycl-link ${SOURCE_FILE} -o ${CMAKE_BINARY_DIR}/${DEVICE_OBJ_FILE}
+ DEPENDS ${SOURCE_FILE})
+endif()
+
+# run
+add_custom_target(run
+ COMMAND ../${TARGET_NAME}.fpga_emu
+ DEPENDS ${TARGET_NAME}.fpga_emu)
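+
+# Typical usage (a sketch; run from a separate build directory created next to
+# the top-level CMakeLists.txt of this design):
+#   cmake ..
+#   make fpga_emu   # build the FPGA emulator executable
+#   make report     # generate the static optimization report
+#   make fpga       # full FPGA hardware compile (Linux only)
+#   make run        # run the emulator executable built by fpga_emu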
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja
new file mode 100755
index 0000000000..94d90e092c
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/build.ninja
@@ -0,0 +1,30 @@
+source_file = onchip_memory_cache.cpp
+target_name = onchip_memory_cache
+
+emulator_target = ${target_name}.fpga_emu.exe
+report_target = ${target_name}_report.a
+report_target_s10_pac = ${target_name}_s10_pac_report.a
+
+hardware_flags = -fintelfpga -Xshardware
+emulator_flags = -fintelfpga -DFPGA_EMULATOR
+
+rule build_fpga_emu
+ command = dpcpp /GX ${emulator_flags} $in -o $out
+
+rule gen_report
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_a10gx_pac:pac_a10 -fsycl-link $in -o $out
+
+rule gen_report_s10_pac
+ command = dpcpp /GX ${hardware_flags} -Xsboard=intel_s10sx_pac:pac_s10 -fsycl-link $in -o $out
+
+# FPGA emulator
+build fpga_emu: phony ${emulator_target}
+build ${emulator_target}: build_fpga_emu ${source_file}
+
+# report
+build report: phony ${report_target}
+build ${report_target}: gen_report ${source_file}
+
+# report (S10 PAC)
+build report_s10_pac: phony ${report_target_s10_pac}
+build ${report_target_s10_pac}: gen_report_s10_pac ${source_file}
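+
+# Typical usage (a sketch; assumes dpcpp and ninja are on the PATH and the
+# commands are run from this directory on Windows):
+#   ninja fpga_emu        # build the FPGA emulator executable
+#   ninja report          # optimization report for the Arria(R) 10 PAC
+#   ninja report_s10_pac  # optimization report for the Stratix(R) 10 SX PAC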
diff --git a/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp
new file mode 100755
index 0000000000..83b48eac97
--- /dev/null
+++ b/DirectProgramming/DPC++FPGA/Tutorials/DesignPatterns/onchip_memory_cache/src/onchip_memory_cache.cpp
@@ -0,0 +1,235 @@
+//==============================================================
+// Copyright Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+#include <CL/sycl.hpp>
+#include <CL/sycl/intel/fpga_extensions.hpp>
+#include <iostream>
+#include "dpc_common.hpp"
+
+using namespace sycl;
+
+constexpr int kInitNumInputs = 16 * 1024 * 1024;  // Default number of inputs
+constexpr int kNumOutputs = 64;                   // Number of histogram bins (outputs)
+constexpr int kInitSeed = 42;                     // Seed for randomizing data inputs
+constexpr int kCacheDepth = 5;                    // Depth of the register cache
+constexpr int kNumRuns = 2;                       // Run twice to show the impact of the cache
+constexpr double kNs = 1000000000.0;              // Number of nanoseconds in a second
+
+// Forward declare the kernel name in the global scope to reduce name mangling
+// in the optimization reports.
+template <bool use_cache>
+class Task;
+
+// This kernel function implements two data paths: with and without caching.
+// use_cache specifies which path to take.
+template <bool use_cache>
+void Histogram(std::unique_ptr<queue>& q, buffer<uint32_t>& input_buf,
+               buffer<uint32_t>& output_buf, event& e) {
+ // Enqueue kernel
+ e = q->submit([&](handler& h) {
+ // Get accessors to the SYCL buffers
+    auto input = input_buf.get_access<access::mode::read>(h);
+    auto output = output_buf.get_access<access::mode::discard_write>(h);
+
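+    // The kernel_args_restrict attribute below promises the compiler that the
+    // kernel's accessor arguments do not alias each other, which allows more
+    // aggressive scheduling of the memory accesses.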
+    h.single_task<Task<use_cache>>([=]() [[intel::kernel_args_restrict]] {
+
+ // On-chip memory for Histogram
+ uint32_t local_output[kNumOutputs];
+ uint32_t local_output_with_cache[kNumOutputs];
+
+ // Register-based cache of recently-accessed memory locations
+ uint32_t last_sum[kCacheDepth + 1];
+ uint32_t last_sum_index[kCacheDepth + 1];
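+      // Because these arrays are only accessed in fully unrolled loops with
+      // compile-time-constant indices, the compiler can implement them as a
+      // shift register in FF resources rather than in on-chip RAM.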
+
+ // Initialize Histogram to zero
+ for (uint32_t b = 0; b < kNumOutputs; ++b) {
+ local_output[b] = 0;
+ local_output_with_cache[b] = 0;
+ }
+
+ // Compute the Histogram
+ if (!use_cache) { // Without cache
+ for (uint32_t n = 0; n < kInitNumInputs; ++n) {
+ // Compute the Histogram index to increment
+ uint32_t b = input[n] % kNumOutputs;
+ local_output[b]++;
+ }
+ } else { // With cache
+
+ // Specify that the minimum dependence-distance of
+ // loop carried variables is kCacheDepth.
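+        // Without this hint the compiler must assume that iteration n may read
+        // the value written by iteration n-1 and would schedule the loop more
+        // conservatively; the register cache below makes the hint safe.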
+ [[intelfpga::ivdep(kCacheDepth)]] for (uint32_t n = 0;
+ n < kInitNumInputs; ++n) {
+ // Compute the Histogram index to increment
+ uint32_t b = input[n] % kNumOutputs;
+
+ // Get the value from the on-chip mem at this index.
+ uint32_t val = local_output_with_cache[b];
+
+ // However, if this location in on-chip mem was recently
+ // written to, take the value from the cache.
+ #pragma unroll
+ for (int i = 0; i < kCacheDepth + 1; i++) {
+ if (last_sum_index[i] == b) val = last_sum[i];
+ }
+
+ // Write the new value to both the cache and the on-chip mem.
+ last_sum[kCacheDepth] = local_output_with_cache[b] = val + 1;
+ last_sum_index[kCacheDepth] = b;
+
+ // Cache is just a shift register, so shift the shift reg. Pushing
+ // into the back of the shift reg is done above.
+ #pragma unroll
+ for (int i = 0; i < kCacheDepth; i++) {
+ last_sum[i] = last_sum[i + 1];
+ last_sum_index[i] = last_sum_index[i + 1];
+ }
+ }
+ }
+
+ // Write output to global memory
+ for (uint32_t b = 0; b < kNumOutputs; ++b) {
+ if (!use_cache) {
+ output[b] = local_output[b];
+ } else {
+ output[b] = local_output_with_cache[b];
+ }
+ }
+ });
+ });
+}
+
+int main() {
+ // Host and kernel profiling
+ event e;
+ ulong t1_kernel, t2_kernel;
+ double time_kernel;
+
+// Create queue, get platform and device
+#if defined(FPGA_EMULATOR)
+ intel::fpga_emulator_selector device_selector;
+ std::cout << "\nEmulator output does not demonstrate true hardware "
+ "performance. The design may need to run on actual hardware "
+ "to observe the performance benefit of the optimization "
+ "exemplified in this tutorial.\n\n";
+#else
+ intel::fpga_selector device_selector;
+#endif
+ try {
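+    // The enable_profiling queue property attaches start/end timestamps to
+    // events submitted to this queue so kernel execution time can be measured.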
+ auto prop_list =
+ property_list{property::queue::enable_profiling()};
+
+    std::unique_ptr<queue> q;
+ q.reset(new queue(device_selector, dpc_common::exception_handler, prop_list));
+
+ platform platform = q->get_context().get_platform();
+ device device = q->get_device();
+ std::cout << "Platform name: "
+              << platform.get_info<info::platform::name>().c_str() << "\n";
+ std::cout << "Device name: "
+              << device.get_info<info::device::name>().c_str() << "\n\n\n";
+
+ std::cout << "\nNumber of inputs: " << kInitNumInputs << "\n";
+ std::cout << "Number of outputs: " << kNumOutputs << "\n\n";
+
+ // Create input and output buffers
+    auto input_buf = buffer<uint32_t>(range<1>(kInitNumInputs));
+    auto output_buf = buffer<uint32_t>(range<1>(kNumOutputs));
+
+ srand(kInitSeed);
+
+ // Compute the reference solution
+ uint32_t gold[kNumOutputs];
+
+ {
+ // Get host-side accessors to the SYCL buffers
+      auto input_host = input_buf.get_access<access::mode::write>();
+ // Initialize random input
+ for (int i = 0; i < kInitNumInputs; ++i) {
+ input_host[i] = rand();
+ }
+
+ for (int b = 0; b < kNumOutputs; ++b) {
+ gold[b] = 0;
+ }
+ for (int i = 0; i < kInitNumInputs; ++i) {
+ int b = input_host[i] % kNumOutputs;
+ gold[b]++;
+ }
+ }
+
+ // Host accessor is now out-of-scope and is destructed. This is required
+ // in order to unblock the kernel's subsequent accessor to the same buffer.
+
+ for (int i = 0; i < kNumRuns; i++) {
+ switch (i) {
+ case 0: {
+ std::cout << "Beginning run without on-chip memory caching.\n\n";
+ Histogram