diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
index 2be7bcff6a..16fc363b00 100644
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
@@ -19,6 +19,6 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ltbb -lsycl")
 add_executable (dpc_reduce src/main.cpp)
 
 add_custom_target (run
-  COMMAND dpc_reduce
+  COMMAND CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB dpc_reduce
   WORKING_DIRECTORY ${CMAKE_PROJECT_DIR}
 )
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
index 212b4011ce..ca20f04511 100644
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
@@ -36,6 +36,10 @@ Using Data Parallel C++, the code sample runs multiple MPI ranks to distribute t
 calculation of the number Pi. Each rank offloads the computation to an accelerator
 (GPU/CPU) using Intel DPC++ compiler to compute a partial compution of the number Pi.
 
+If you run the sample on a CPU as your default device, you may need to increase
+the OpenCL private memory allocation. You can do this by setting the environment
+variable `CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB`.
+
 ## Key Implementation Details
 
 The basic DPC++ implementation explained in the code includes accessor,
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
index 5302acc385..bcf36cf1bf 100644
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
@@ -20,7 +20,7 @@
           "cd build",
           "cmake ..",
           "make",
-          "./dpc_reduce"
+          "CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB ./src/dpc_reduce"
         ]
       }
     ]
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
index 6d154fc3f8..ab9b72a39c 100755
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
@@ -167,108 +167,126 @@ struct slice_area {
   };
 };
 
-// a way to get value_type from both accessors and USM that is needed for
-// transform_init
+
+// a way to get value_type from both accessors and USM that is needed for transform_init
 template <typename Unknown>
-struct accessor_traits {};
-
-template <typename T, int Dim, sycl::access::mode AccMode,
-          sycl::access::target AccTarget, sycl::access::placeholder Placeholder>
-struct accessor_traits<
-    sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>> {
-  using value_type = typename sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>::value_type;
+struct accessor_traits
+{
+};
+
+template <typename T, int Dim, sycl::access::mode AccMode, sycl::access::target AccTarget,
+          sycl::access::placeholder Placeholder>
+struct accessor_traits<sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>>
+{
+    using value_type = typename sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>::value_type;
 };
 
 template <typename RawArrayValueType>
-struct accessor_traits<RawArrayValueType*> {
-  using value_type = RawArrayValueType;
+struct accessor_traits<RawArrayValueType*>
+{
+    using value_type = RawArrayValueType;
 };
 
 // calculate shift where we should start processing on current item
-template <typename NDItemId, typename GlobalIdx, typename SizeNIter, typename SizeN>
-SizeN calc_shift(const NDItemId item_id, const GlobalIdx global_idx,
-                 SizeNIter& n_iter, const SizeN n) {
-  auto global_range_size = item_id.get_global_range().size();
-
-  auto start = n_iter * global_idx;
-  auto global_shift = global_idx + n_iter * global_range_size;
-  if (n_iter > 0 && global_shift > n) {
-    start += n % global_range_size - global_idx;
-  } else if (global_shift < n) {
-    n_iter++;
-  }
-  return start;
-}
-
-template <typename Operation1, typename Operation2>
-struct transform_init {
-  Operation1 binary_op;
-  Operation2 unary_op;
-
-  template <typename NDItemId, typename GlobalIdx, typename Size,
-            typename AccLocal, typename... Acc>
-  void operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n,
-                  AccLocal& local_mem, const Acc&... acc) {
-    auto local_idx = item_id.get_local_id(0);
-    auto global_range_size = item_id.get_global_range().size();
-    auto n_iter = n / global_range_size;
-    auto start = calc_shift(item_id, global_idx, n_iter, n);
-    auto shifted_global_idx = global_idx + start;
-    typename accessor_traits<AccLocal>::value_type res;
-    if (global_idx < n) {
-      res = unary_op(shifted_global_idx, acc...);
-    }
-    // Add neighbour to the current local_mem
-    for (decltype(n_iter) i = 1; i < n_iter; ++i) {
-      res = binary_op(res, unary_op(shifted_global_idx + i, acc...));
-    }
-    if (global_idx < n) {
-      local_mem[local_idx] = res;
-    }
-  }
+template <typename NDItemId, typename GlobalIdx, typename SizeNIter, typename SizeN>
+SizeN
+calc_shift(const NDItemId item_id, const GlobalIdx global_idx, SizeNIter& n_iter, const SizeN n)
+{
+    auto global_range_size = item_id.get_global_range().size();
+
+    auto start = n_iter * global_idx;
+    auto global_shift = global_idx + n_iter * global_range_size;
+    if (n_iter > 0 && global_shift > n)
+    {
+        start += n % global_range_size - global_idx;
+    }
+    else if (global_shift < n)
+    {
+        n_iter++;
+    }
+    return start;
+}
+
+
+template <typename Operation1, typename Operation2>
+struct transform_init
+{
+    Operation1 binary_op;
+    Operation2 unary_op;
+
+    template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal,
+              typename... Acc>
+    void
+    operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, AccLocal& local_mem,
+               const Acc&... acc)
+    {
+        auto local_idx = item_id.get_local_id(0);
+        auto global_range_size = item_id.get_global_range().size();
+        auto n_iter = n / global_range_size;
+        auto start = calc_shift(item_id, global_idx, n_iter, n);
+        auto shifted_global_idx = global_idx + start;
+
+        typename accessor_traits<AccLocal>::value_type res;
+        if (global_idx < n)
+        {
+            res = unary_op(shifted_global_idx, acc...);
+        }
+        // Add neighbour to the current local_mem
+        for (decltype(n_iter) i = 1; i < n_iter; ++i)
+        {
+            res = binary_op(res, unary_op(shifted_global_idx + i, acc...));
+        }
+        if (global_idx < n)
+        {
+            local_mem[local_idx] = res;
+        }
+    }
 };
 
+
 // Reduce on local memory
 template <typename BinaryOperation1, typename Tp>
-struct reduce {
-  BinaryOperation1 bin_op1;
-
-  template <typename NDItemId, typename GlobalIdx, typename Size,
-            typename AccLocal>
-  Tp operator()(const NDItemId item_id, const GlobalIdx global_idx,
-                const Size n, AccLocal& local_mem) {
-    auto local_idx = item_id.get_local_id(0);
-    auto group_size = item_id.get_local_range().size();
-
-    auto k = 1;
-    do {
-      item_id.barrier(sycl::access::fence_space::local_space);
-      if (local_idx % (2 * k) == 0 && local_idx + k < group_size &&
-          global_idx < n && global_idx + k < n) {
-        local_mem[local_idx] =
-            bin_op1(local_mem[local_idx], local_mem[local_idx + k]);
-      }
-      k *= 2;
-    } while (k < group_size);
-    return local_mem[local_idx];
-  }
+struct reduce
+{
+    BinaryOperation1 bin_op1;
+
+    template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal>
+    Tp
+    operator()(const NDItemId item_id, const GlobalIdx global_idx, const Size n, AccLocal& local_mem)
+    {
+        auto local_idx = item_id.get_local_id(0);
+        auto group_size = item_id.get_local_range().size();
+
+        auto k = 1;
+        do
+        {
+            item_id.barrier(sycl::access::fence_space::local_space);
+            if (local_idx % (2 * k) == 0 && local_idx + k < group_size && global_idx < n &&
+                global_idx + k < n)
+            {
+                local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]);
+            }
+            k *= 2;
+        } while (k < group_size);
+        return local_mem[local_idx];
+    }
 };
 
+
 // walk through the data
 template <typename F>
-struct walk_n {
-  F f;
-
-  template <typename ItemId, typename... Ranges>
-  auto operator()(const ItemId idx, Ranges&&... rngs)
-      -> decltype(f(rngs[idx]...)) {
-    return f(rngs[idx]...);
-  }
+struct walk_n
+{
+    F f;
+
+    template <typename ItemId, typename... Ranges>
+    auto
+    operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...))
+    {
+        return f(rngs[idx]...);
+    }
 };
 
+
 // This option uses a parallel for to fill the buffer and then
 // uses a tranform_init with plus/no_op and then
 // a local reduction then global reduction.
@@ -301,12 +319,12 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) {
   // In this example we have done the calculation and filled the buffer above
   // The way transform_init works is that you need to have the value already
   // populated in the buffer.
-  auto tf_init = transform_init<std::plus<float>, Functor>{
-      std::plus<float>(), Functor{my_no_op()}};
+  auto tf_init = transform_init<std::plus<float>,
+                                Functor>{std::plus<float>(), Functor{my_no_op()}};
 
   auto combine = std::plus<float>();
-  auto brick_reduce =
-      reduce<std::plus<float>, float>{std::plus<float>()};
+  auto brick_reduce = reduce<std::plus<float>, float>{
+      std::plus<float>()};
   auto workgroup_size =
       policy.queue()
           .get_device()
@@ -336,8 +354,8 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) {
         [=](nd_item<1> item_id) mutable {
           auto global_idx = item_id.get_global_id(0);
           // 1. Initialization (transform part).
-          tf_init(item_id, global_idx, num_steps, temp_buf_local,
-                  access_buf);
+          tf_init(item_id, global_idx, num_steps,
+                  temp_buf_local, access_buf);
           // 2. Reduce within work group
           float local_result = brick_reduce(
               item_id, global_idx, num_steps, temp_buf_local);
@@ -402,12 +420,13 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) {
   // The buffer has 1...num it at and now we will use that as an input
   // to the slice structue which will calculate the area of each
   // rectangle.
-  auto tf_init = transform_init<std::plus<float>, Functor2>{
-      std::plus<float>(), Functor2{slice_area(num_steps)}};
+  auto tf_init = transform_init<std::plus<float>,
+                                Functor2>{
+      std::plus<float>(), Functor2{slice_area(num_steps)}};
 
   auto combine = std::plus<float>();
-  auto brick_reduce =
-      reduce<std::plus<float>, float>{
-          std::plus<float>()};
+  auto brick_reduce = reduce<std::plus<float>, float>{
+      std::plus<float>()};
+
   // get workgroup_size from the device
   auto workgroup_size =
@@ -446,8 +465,8 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) {
           auto global_idx = item_id.get_global_id(0);
           // 1. Initialization (transform part). Fill local
           // memory
-          tf_init(item_id, global_idx, num_steps, temp_buf_local,
-                  access_buf);
+          tf_init(item_id, global_idx, num_steps,
+                  temp_buf_local, access_buf);
           // 2. Reduce within work group
           float local_result = brick_reduce(
               item_id, global_idx, num_steps, temp_buf_local);
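
For a manual run outside the CMake `run` target, the same OpenCL override can be exported in the shell first. A minimal sketch, assuming the `build/src` layout used in the sample.json run step above:

    # set the CPU private-memory override, then launch the sample
    cd build
    export CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB
    ./src/dpc_reduce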