diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
index 2be7bcff6a..16fc363b00 100644
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/CMakeLists.txt
@@ -19,6 +19,6 @@ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -ltbb -lsycl")
 add_executable (dpc_reduce src/main.cpp)
 
 add_custom_target (run
-  COMMAND dpc_reduce
+  COMMAND CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB dpc_reduce
   WORKING_DIRECTORY ${CMAKE_PROJECT_DIR}
 )
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
index 212b4011ce..ca20f04511 100644
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/README.md
@@ -36,6 +36,10 @@ Using Data Parallel C++, the code sample runs multiple MPI ranks to distribute t
 calculation of the number Pi. Each rank offloads the computation to an accelerator
 (GPU/CPU) using Intel DPC++ compiler to compute a partial compution of the number Pi.
 
+If you run the sample on a CPU as your default device, you may need to increase
+the OpenCL private memory allocation. You can do this by setting the environment
+variable `CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB`.
+
 ## Key Implementation Details
 
 The basic DPC++ implementation explained in the code includes accessor,
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
index 5302acc385..bcf36cf1bf 100644
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/sample.json
@@ -20,7 +20,7 @@
           "cd build",
           "cmake ..",
           "make",
-          "./dpc_reduce"
+          "CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB ./src/dpc_reduce"
         ]
       }
     ]
diff --git a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
index 6d154fc3f8..ab9b72a39c 100755
--- a/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
+++ b/DirectProgramming/DPC++/ParallelPatterns/dpc_reduce/src/main.cpp
@@ -167,108 +167,126 @@ struct slice_area {
   };
 };
 
-// a way to get value_type from both accessors and USM that is needed for
-// transform_init
+
+// a way to get value_type from both accessors and USM that is needed for transform_init
 template <typename Unknown>
-struct accessor_traits {};
-
-template <typename T, int Dim, sycl::access::mode AccMode,
-          sycl::access::target AccTarget, sycl::access::placeholder Placeholder>
-struct accessor_traits<
-    sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>> {
-  using value_type = typename sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>::value_type;
+struct accessor_traits
+{
+};
+
+template <typename T, int Dim, sycl::access::mode AccMode, sycl::access::target AccTarget,
+          sycl::access::placeholder Placeholder>
+struct accessor_traits<sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>>
+{
+    using value_type = typename sycl::accessor<T, Dim, AccMode, AccTarget, Placeholder>::value_type;
 };
 
 template <typename RawArrayValueType>
-struct accessor_traits<RawArrayValueType*> {
-  using value_type = RawArrayValueType;
+struct accessor_traits<RawArrayValueType*>
+{
+    using value_type = RawArrayValueType;
 };
 
 // calculate shift where we should start processing on current item
-template <typename NDItemId, typename GlobalIdx, typename SizeNIter, typename SizeN>
-SizeN calc_shift(const NDItemId item_id, const GlobalIdx global_idx,
-                 SizeNIter& n_iter, const SizeN n) {
-  auto global_range_size = item_id.get_global_range().size();
-
-  auto start = n_iter * global_idx;
-  auto global_shift = global_idx + n_iter * global_range_size;
-  if (n_iter > 0 && global_shift > n) {
-    start += n % global_range_size - global_idx;
-  } else if (global_shift < n) {
-    n_iter++;
-  }
-  return start;
-}
-
-template <typename Operation1, typename Operation2>
-struct transform_init {
-  Operation1 binary_op;
-  Operation2 unary_op;
-
-  template <typename NDItemId, typename GlobalIdx, typename Size,
-            typename AccLocal, typename... Acc>
-  void operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n,
-                  AccLocal& local_mem, const Acc&... acc) {
-    auto local_idx = item_id.get_local_id(0);
-    auto global_range_size = item_id.get_global_range().size();
-    auto n_iter = n / global_range_size;
-    auto start = calc_shift(item_id, global_idx, n_iter, n);
-    auto shifted_global_idx = global_idx + start;
-    typename accessor_traits<AccLocal>::value_type res;
-    if (global_idx < n) {
-      res = unary_op(shifted_global_idx, acc...);
-    }
-    // Add neighbour to the current local_mem
-    for (decltype(n_iter) i = 1; i < n_iter; ++i) {
-      res = binary_op(res, unary_op(shifted_global_idx + i, acc...));
-    }
-    if (global_idx < n) {
-      local_mem[local_idx] = res;
-    }
-  }
+template <typename NDItemId, typename GlobalIdx, typename SizeNIter, typename SizeN>
+SizeN
+calc_shift(const NDItemId item_id, const GlobalIdx global_idx, SizeNIter& n_iter, const SizeN n)
+{
+    auto global_range_size = item_id.get_global_range().size();
+
+    auto start = n_iter * global_idx;
+    auto global_shift = global_idx + n_iter * global_range_size;
+    if (n_iter > 0 && global_shift > n)
+    {
+        start += n % global_range_size - global_idx;
+    }
+    else if (global_shift < n)
+    {
+        n_iter++;
+    }
+    return start;
+}
+
+
+template <typename Operation1, typename Operation2>
+struct transform_init
+{
+    Operation1 binary_op;
+    Operation2 unary_op;
+
+    template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal,
+              typename... Acc>
+    void
+    operator()(const NDItemId item_id, const GlobalIdx global_idx, Size n, AccLocal& local_mem,
+               const Acc&... acc)
+    {
+        auto local_idx = item_id.get_local_id(0);
+        auto global_range_size = item_id.get_global_range().size();
+        auto n_iter = n / global_range_size;
+        auto start = calc_shift(item_id, global_idx, n_iter, n);
+        auto shifted_global_idx = global_idx + start;
+
+        typename accessor_traits<AccLocal>::value_type res;
+        if (global_idx < n)
+        {
+            res = unary_op(shifted_global_idx, acc...);
+        }
+        // Add neighbour to the current local_mem
+        for (decltype(n_iter) i = 1; i < n_iter; ++i)
+        {
+            res = binary_op(res, unary_op(shifted_global_idx + i, acc...));
+        }
+        if (global_idx < n)
+        {
+            local_mem[local_idx] = res;
+        }
+    }
 };
 
+
 // Reduce on local memory
 template <typename BinaryOperation1, typename Tp>
-struct reduce {
-  BinaryOperation1 bin_op1;
-
-  template <typename NDItemId, typename GlobalIdx, typename Size,
-            typename AccLocal>
-  Tp operator()(const NDItemId item_id, const GlobalIdx global_idx,
-                const Size n, AccLocal& local_mem) {
-    auto local_idx = item_id.get_local_id(0);
-    auto group_size = item_id.get_local_range().size();
-
-    auto k = 1;
-    do {
-      item_id.barrier(sycl::access::fence_space::local_space);
-      if (local_idx % (2 * k) == 0 && local_idx + k < group_size &&
-          global_idx < n && global_idx + k < n) {
-        local_mem[local_idx] =
-            bin_op1(local_mem[local_idx], local_mem[local_idx + k]);
-      }
-      k *= 2;
-    } while (k < group_size);
-    return local_mem[local_idx];
-  }
+struct reduce
+{
+    BinaryOperation1 bin_op1;
+
+    template <typename NDItemId, typename GlobalIdx, typename Size, typename AccLocal>
+    Tp
+    operator()(const NDItemId item_id, const GlobalIdx global_idx, const Size n, AccLocal& local_mem)
+    {
+        auto local_idx = item_id.get_local_id(0);
+        auto group_size = item_id.get_local_range().size();
+
+        auto k = 1;
+        do
+        {
+            item_id.barrier(sycl::access::fence_space::local_space);
+            if (local_idx % (2 * k) == 0 && local_idx + k < group_size && global_idx < n &&
+                global_idx + k < n)
+            {
+                local_mem[local_idx] = bin_op1(local_mem[local_idx], local_mem[local_idx + k]);
+            }
+            k *= 2;
+        } while (k < group_size);
+        return local_mem[local_idx];
+    }
 };
 
+
 // walk through the data
 template <typename F>
-struct walk_n {
-  F f;
-
-  template <typename ItemId, typename... Ranges>
-  auto operator()(const ItemId idx, Ranges&&... rngs)
-      -> decltype(f(rngs[idx]...)) {
-    return f(rngs[idx]...);
-  }
+struct walk_n
+{
+    F f;
+
+    template <typename ItemId, typename... Ranges>
+    auto
+    operator()(const ItemId idx, Ranges&&... rngs) -> decltype(f(rngs[idx]...))
+    {
+        return f(rngs[idx]...);
+    }
 };
 
+
 // This option uses a parallel for to fill the buffer and then
 // uses a tranform_init with plus/no_op and then
 // a local reduction then global reduction.
@@ -301,12 +319,12 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) {
   // In this example we have done the calculation and filled the buffer above
   // The way transform_init works is that you need to have the value already
   // populated in the buffer.
-  auto tf_init = transform_init<std::plus<float>, Functor>{
-      std::plus<float>(), Functor{my_no_op()}};
+  auto tf_init = transform_init<std::plus<float>,
+                                Functor>{std::plus<float>(), Functor{my_no_op()}};
 
   auto combine = std::plus<float>();
-  auto brick_reduce =
-      reduce<std::plus<float>, float>{std::plus<float>()};
+  auto brick_reduce = reduce<std::plus<float>, float>{
+      std::plus<float>()};
   auto workgroup_size =
       policy.queue()
           .get_device()
@@ -336,8 +354,8 @@ float calc_pi_dpstd_native3(size_t num_steps, int groups, Policy&& policy) {
         [=](nd_item<1> item_id) mutable {
           auto global_idx = item_id.get_global_id(0);
           // 1. Initialization (transform part).
-          tf_init(item_id, global_idx, num_steps, temp_buf_local,
-                  access_buf);
+          tf_init(item_id, global_idx, num_steps,
+                  temp_buf_local, access_buf);
           // 2. Reduce within work group
           float local_result = brick_reduce(
               item_id, global_idx, num_steps, temp_buf_local);
@@ -402,12 +420,13 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) {
   // The buffer has 1...num it at and now we will use that as an input
   // to the slice structue which will calculate the area of each
   // rectangle.
-  auto tf_init = transform_init<std::plus<float>, Functor2>{
-      std::plus<float>(), Functor2{slice_area(num_steps)}};
+  auto tf_init = transform_init<std::plus<float>,
+                                Functor2>{
+      std::plus<float>(), Functor2{slice_area(num_steps)}};
 
   auto combine = std::plus<float>();
-  auto brick_reduce =
-      reduce<std::plus<float>, float>{
-          std::plus<float>()};
+  auto brick_reduce = reduce<std::plus<float>, float>{
+      std::plus<float>()};
+
   // get workgroup_size from the device
   auto workgroup_size =
@@ -446,8 +465,8 @@ float calc_pi_dpstd_native4(size_t num_steps, int groups, Policy&& policy) {
           auto global_idx = item_id.get_global_id(0);
           // 1. Initialization (transform part). Fill local
           // memory
-          tf_init(item_id, global_idx, num_steps, temp_buf_local,
-                  access_buf);
+          tf_init(item_id, global_idx, num_steps,
+                  temp_buf_local, access_buf);
           // 2. Reduce within work group
           float local_result = brick_reduce(
               item_id, global_idx, num_steps, temp_buf_local);
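
For a manual run outside the CMake `run` target, the same OpenCL override can be exported in the shell first. A minimal sketch, assuming the `build/src` layout used in the sample.json run step above:

    # set the CPU private-memory override, then launch the sample
    cd build
    export CL_CONFIG_CPU_FORCE_PRIVATE_MEM_SIZE=16MB
    ./src/dpc_reduce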