From f5f6b3ff15a2e26656221950e8e3fbe85766a0ef Mon Sep 17 00:00:00 2001 From: antonrydahl Date: Thu, 14 Sep 2023 23:43:43 -0700 Subject: [PATCH 1/4] Proof of concept for offloading C++ standard parallel algorithms to the GPU using OpenMP. To enable offloading, compile with -D_LIBCPP_ENABLE_OPENMP_OFFLOAD --- .../pstl_backends/cpu_backends/fill.h | 2 +- .../pstl_backends/cpu_backends/for_each.h | 21 ++- .../pstl_backends/cpu_backends/transform.h | 28 +++- .../cpu_backends/transform_reduce.h | 8 +- libcxx/include/__config | 121 +++++++++++++++++- 5 files changed, 169 insertions(+), 11 deletions(-) diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 8b531887c7318..7f66873069703 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -27,7 +27,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_DifferenceType __i = 0; __i < __n; ++__i) __first[__i] = __value; return __first + __n; diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index f6f22fdd8713c..1155c070496c8 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -26,10 +26,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { - _PSTL_PRAGMA_SIMD + _PSTL_OMP_MAP_TO(__first,__n); + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first[__i]); + _PSTL_OMP_MAP_FROM(__first,__n); + return __first + __n; +} +/** + * Specialization for std::vector where the base pointer must be extrated to map + * the data to and from the GPU. +*/ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter __simd_walk_1(std::__wrap_iter __first, _DifferenceType __n, _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first,__n); + // For std::vector the base pointer of the data buffer needs to be extracted + std::pointer_traits> PT; + T* data = PT.to_address(__first); + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(data[__i]); + _PSTL_OMP_MAP_FROM(__first,__n); return __first + __n; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index 0259d8a84bb3f..6611be3b7a448 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -30,9 +30,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Iterator2 __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept { - _PSTL_PRAGMA_SIMD + _PSTL_OMP_MAP_TO(__first1,__n); + _PSTL_OMP_MAP_ALLOC(__first2,__n); + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first1[__i], __first2[__i]); + _PSTL_OMP_MAP_FROM(__first2,__n); + return __first2 + __n; +} + +/** + * Specialization for std::vector where the base pointer must be extrated to map + * the data to and from the GPU. 
+*/ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter +__simd_walk_2(std::__wrap_iter __first1, _DifferenceType __n, std::__wrap_iter __first2, _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first1,__n); + _PSTL_OMP_MAP_ALLOC(__first2,__n); + std::pointer_traits> PT1; + std::pointer_traits> PT2; + T1* __data1 = PT1.to_address(__first1); + T2* __data2 = PT2.to_address(__first2); + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__data1[__i], __data2[__i]); + _PSTL_OMP_MAP_FROM(__first2,__n); return __first2 + __n; } @@ -72,7 +96,7 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform( template _LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3( _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept { - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first1[__i], __first2[__i], __first3[__i]); return __first3 + __n; diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index 2afe5c7d10483..885648982409c 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -57,7 +57,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un _Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer); // initializer - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __i = 0; __i < __block_size; ++__i) { ::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i))); } @@ -65,13 +65,13 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un _Size __i = 2 * __block_size; const _Size __last_iteration = __block_size * (__n / __block_size); for (; __i < __last_iteration; __i += __block_size) { - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __j = 0; __j < __block_size; ++__j) { __lane[__j] = __binary_op(std::move(__lane[__j]), __f(__i + __j)); } } // remainder - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __j = 0; __j < __n - __last_iteration; ++__j) { __lane[__j] = __binary_op(std::move(__lane[__j]), __f(__last_iteration + __j)); } @@ -80,7 +80,7 @@ __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _Un __init = __binary_op(std::move(__init), std::move(__lane[__j])); } // destroyer - _PSTL_PRAGMA_SIMD + _PSTL_PRAGMA_SIMD() for (_Size __j = 0; __j < __block_size; ++__j) { __lane[__j].~_Tp(); } diff --git a/libcxx/include/__config b/libcxx/include/__config index bf2564e2732ba..087c5c9123551 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1414,8 +1414,118 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c // Enable SIMD for compilers that support OpenMP 4.0 # if (defined(_OPENMP) && _OPENMP >= 201307) +# ifdef _LIBCPP_ENABLE_OPENMP_OFFLOAD +# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES +# define _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES 32768 +# endif +#include +#include <__iterator/wrap_iter.h> +# define _PSTL_PRAGMA_DATA_MAP_TO(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(to:NAME[:LEN])) +# define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(alloc:NAME[:LEN])) +# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target update from(NAME[:LEN])) + +template +bool constexpr OMPIsOffloadable(N size) +{ + return size >= _LIBCPP_ENABLE_OPENMP_OFFLOAD_MIN_BYTES; +} + +bool 
constexpr OMPIsOffloadable(void) +{ + return false; +} + +template +void inline OMPMapToIf(T data,N length,int device = omp_get_default_device()) +{ + // If the data is already present on the device, there is no need + // transfer the data again. + if (omp_target_is_present(data,device)){ + return; + } + // If it is a small amount of data it does not make sense to offload to a + // device + if (!OMPIsOffloadable(length)){ + return; + } + _PSTL_PRAGMA_DATA_MAP_TO(data,length); +} + +template +void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device()) +{ + // If the data is already present on the device, there is no need + // transfer the data again. + if (omp_target_is_present(data,device)){ + return; + } + // If it is a small amount of data it does not make sense to offload to a + // device + if (!OMPIsOffloadable(length)){ + return; + } + _PSTL_PRAGMA_DATA_MAP_ALLOC(data,length); +} + +template +void inline OMPMapTo(T data,N length) { + OMPMapToIf(data,length); +} + +/** + * Specialization for std::vector +*/ + +template +void inline OMPMapTo(std::__wrap_iter w,N length) { + std::pointer_traits> PT; + T* data = PT.to_address(w); + OMPMapToIf(data,length); +} + +template +void inline OMPMapAlloc(T data,N length) { + OMPMapAllocIf(data,length); +} + +/** + * Specialization for std::vector +*/ + +template +void inline OMPMapAlloc(std::__wrap_iter w,N length) { + std::pointer_traits> PT; + T* data = PT.to_address(w); + OMPMapAllocIf(data,length); +} + +template +void inline OMPMapFrom(T data,N length) { + _PSTL_PRAGMA_DATA_MAP_FROM(data,length); +} + +/** + * Specialization for std::vector +*/ + +template +void inline OMPMapFrom(std::__wrap_iter w,N length) { + std::pointer_traits> PT; + T* data = PT.to_address(w); + _PSTL_PRAGMA_DATA_MAP_FROM(data,length); +} +# define _PSTL_OMP_MAP_TO(DATA,LEN) OMPMapTo(DATA,LEN) +# define _PSTL_OMP_MAP_ALLOC(DATA,LEN) OMPMapAlloc(DATA,LEN) +# define _PSTL_OMP_MAP_FROM(DATA,LEN) OMPMapFrom(DATA,LEN) +# define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp target teams distribute parallel for simd if(OMPIsOffloadable(__VA_ARGS__))) +# else +# define _PSTL_PRAGMA_SIMD(...) _PSTL_PRAGMA(omp simd) +# define _PSTL_OMP_MAP_TO(DATA,LEN) +# define _PSTL_OMP_MAP_ALLOC(DATA,LEN) +# define _PSTL_OMP_MAP_FROM(DATA,LEN) +# endif + # define _PSTL_UDR_PRESENT -# define _PSTL_PRAGMA_SIMD _PSTL_PRAGMA(omp simd) # define _PSTL_PRAGMA_DECLARE_SIMD _PSTL_PRAGMA(omp declare simd) # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _PSTL_PRAGMA(omp simd reduction(PRM)) # define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd reduction(inscan, PRM)) @@ -1434,7 +1544,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # elif defined(_LIBCPP_COMPILER_CLANG_BASED) -# define _PSTL_PRAGMA_SIMD _Pragma("clang loop vectorize(enable) interleave(enable)") +# define _PSTL_PRAGMA_SIMD(...) _Pragma("clang loop vectorize(enable) interleave(enable)") # define _PSTL_PRAGMA_DECLARE_SIMD # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)") # define _PSTL_PRAGMA_SIMD_SCAN(PRM) _Pragma("clang loop vectorize(enable) interleave(enable)") @@ -1444,7 +1554,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # else // (defined(_OPENMP) && _OPENMP >= 201307) -# define _PSTL_PRAGMA_SIMD +# define _PSTL_PRAGMA_SIMD(...) 
# define _PSTL_PRAGMA_DECLARE_SIMD # define _PSTL_PRAGMA_SIMD_REDUCTION(PRM) # define _PSTL_PRAGMA_SIMD_SCAN(PRM) @@ -1454,6 +1564,11 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # endif // (defined(_OPENMP) && _OPENMP >= 201307) +# ifndef _LIBCPP_ENABLE_OPENMP_OFFLOAD +# define _PSTL_OMP_MAP_TO(...) +# define _PSTL_OMP_MAP_FROM(...) +# endif + # define _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED #endif // __cplusplus From 897ee3a2029ee77ec56ac62d4f4e7a409e9308f8 Mon Sep 17 00:00:00 2001 From: antonrydahl Date: Fri, 15 Sep 2023 10:29:04 -0700 Subject: [PATCH 2/4] Clang-formatted for_Each.h and transform.h --- .../pstl_backends/cpu_backends/for_each.h | 15 ++++++++------- .../pstl_backends/cpu_backends/transform.h | 18 +++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index 1155c070496c8..ed336766295cb 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -26,29 +26,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Iterator __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { - _PSTL_OMP_MAP_TO(__first,__n); + _PSTL_OMP_MAP_TO(__first, __n); _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first[__i]); - _PSTL_OMP_MAP_FROM(__first,__n); + _PSTL_OMP_MAP_FROM(__first, __n); return __first + __n; } /** * Specialization for std::vector where the base pointer must be extrated to map * the data to and from the GPU. -*/ + */ template -_LIBCPP_HIDE_FROM_ABI std::__wrap_iter __simd_walk_1(std::__wrap_iter __first, _DifferenceType __n, _Function __f) noexcept { - _PSTL_OMP_MAP_TO(__first,__n); - // For std::vector the base pointer of the data buffer needs to be extracted +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter +__simd_walk_1(std::__wrap_iter __first, _DifferenceType __n, _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first, __n); + // For std::vector the base pointer of the data buffer needs to be extracted std::pointer_traits> PT; T* data = PT.to_address(__first); _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(data[__i]); - _PSTL_OMP_MAP_FROM(__first,__n); + _PSTL_OMP_MAP_FROM(__first, __n); return __first + __n; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index 6611be3b7a448..06348b0fc5c2a 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -30,25 +30,25 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Iterator2 __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept { - _PSTL_OMP_MAP_TO(__first1,__n); - _PSTL_OMP_MAP_ALLOC(__first2,__n); + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_ALLOC(__first2, __n); _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first1[__i], __first2[__i]); - _PSTL_OMP_MAP_FROM(__first2,__n); + _PSTL_OMP_MAP_FROM(__first2, __n); return __first2 + __n; } /** * Specialization for std::vector where the base pointer must be extrated to map * the data to and from the GPU. 
-*/ + */ template -_LIBCPP_HIDE_FROM_ABI std::__wrap_iter -__simd_walk_2(std::__wrap_iter __first1, _DifferenceType __n, std::__wrap_iter __first2, _Function __f) noexcept { - _PSTL_OMP_MAP_TO(__first1,__n); - _PSTL_OMP_MAP_ALLOC(__first2,__n); +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter __simd_walk_2( + std::__wrap_iter __first1, _DifferenceType __n, std::__wrap_iter __first2, _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_ALLOC(__first2, __n); std::pointer_traits> PT1; std::pointer_traits> PT2; T1* __data1 = PT1.to_address(__first1); @@ -56,7 +56,7 @@ __simd_walk_2(std::__wrap_iter __first1, _DifferenceType __n, std::__wrap_i _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__data1[__i], __data2[__i]); - _PSTL_OMP_MAP_FROM(__first2,__n); + _PSTL_OMP_MAP_FROM(__first2, __n); return __first2 + __n; } From 22e51f1ca279908be7c595036546a06a3ef123a2 Mon Sep 17 00:00:00 2001 From: antonrydahl Date: Mon, 18 Sep 2023 13:11:33 -0700 Subject: [PATCH 3/4] Updated OpenMP data mapping to be conservative unless _LIBCPP_OPENMP_OFFLOAD_MAPPED is defined --- libcxx/include/__config | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index 087c5c9123551..f4e5c511cdf67 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1422,7 +1422,7 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c #include <__iterator/wrap_iter.h> # define _PSTL_PRAGMA_DATA_MAP_TO(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(to:NAME[:LEN])) # define _PSTL_PRAGMA_DATA_MAP_ALLOC(NAME,LEN) _PSTL_PRAGMA(omp target enter data map(alloc:NAME[:LEN])) -# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target update from(NAME[:LEN])) +# define _PSTL_PRAGMA_DATA_MAP_FROM(NAME,LEN) _PSTL_PRAGMA(omp target exit data map(from:NAME[:LEN])) template bool constexpr OMPIsOffloadable(N size) @@ -1440,9 +1440,11 @@ void inline OMPMapToIf(T data,N length,int device = omp_get_default_device()) { // If the data is already present on the device, there is no need // transfer the data again. +#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED if (omp_target_is_present(data,device)){ return; } +#endif // If it is a small amount of data it does not make sense to offload to a // device if (!OMPIsOffloadable(length)){ @@ -1456,9 +1458,11 @@ void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device()) { // If the data is already present on the device, there is no need // transfer the data again. +#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED if (omp_target_is_present(data,device)){ return; } +#endif // If it is a small amount of data it does not make sense to offload to a // device if (!OMPIsOffloadable(length)){ @@ -1467,6 +1471,24 @@ void inline OMPMapAllocIf(T data,N length,int device = omp_get_default_device()) _PSTL_PRAGMA_DATA_MAP_ALLOC(data,length); } +template +void inline OMPMapFromIf(T data,N length,int device = omp_get_default_device()) +{ + // If the data is already present on the device, there is no need + // transfer the data again. 
+#ifdef _LIBCPP_OPENMP_OFFLOAD_MAPPED + if (omp_target_is_present(data,device)){ + return; + } +#endif + // If it is a small amount of data it does not make sense to offload to a + // device + if (!OMPIsOffloadable(length)){ + return; + } + _PSTL_PRAGMA_DATA_MAP_FROM(data,length); +} + template void inline OMPMapTo(T data,N length) { OMPMapToIf(data,length); @@ -1501,7 +1523,7 @@ void inline OMPMapAlloc(std::__wrap_iter w,N length) { template void inline OMPMapFrom(T data,N length) { - _PSTL_PRAGMA_DATA_MAP_FROM(data,length); + OMPMapFromIf(data,length); } /** @@ -1512,7 +1534,7 @@ template void inline OMPMapFrom(std::__wrap_iter w,N length) { std::pointer_traits> PT; T* data = PT.to_address(w); - _PSTL_PRAGMA_DATA_MAP_FROM(data,length); + OMPMapFromIf(data,length); } # define _PSTL_OMP_MAP_TO(DATA,LEN) OMPMapTo(DATA,LEN) # define _PSTL_OMP_MAP_ALLOC(DATA,LEN) OMPMapAlloc(DATA,LEN) From 70631edaac5bba8c798d009ae2de3ec62f9e41e5 Mon Sep 17 00:00:00 2001 From: antonrydahl Date: Wed, 20 Sep 2023 10:28:58 -0700 Subject: [PATCH 4/4] Offloading version of transform with three iterator inputs --- .../pstl_backends/cpu_backends/fill.h | 26 +++++++++++++- .../pstl_backends/cpu_backends/transform.h | 36 ++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 7f66873069703..b7e12adec1e81 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -27,9 +27,33 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _Index __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED - _PSTL_PRAGMA_SIMD() + _PSTL_OMP_MAP_TO(__first, __n); +# pragma omp target enter data map(to : __value) + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __first[__i] = __value; + _PSTL_OMP_MAP_FROM(__first, __n); + return __first + __n; +} + +/** + * Specialization for std::vector where the base pointer must be extrated to map + * the data to and from the GPU. 
+ */ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter +__simd_fill_n(std::__wrap_iter __first, _DifferenceType __n, const _Tp& __value) noexcept { + _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED + _PSTL_OMP_MAP_TO(__first, __n); + // For std::vector the base pointer of the data buffer needs to be extracted + std::pointer_traits> PT; + T* data = PT.to_address(__first); +# pragma omp target enter data map(to : __value) + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + data[__i] = __value; + _PSTL_OMP_MAP_FROM(__first, __n); return __first + __n; } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index 06348b0fc5c2a..e6cd70b5a420b 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -96,9 +96,43 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform( template _LIBCPP_HIDE_FROM_ABI _Iterator3 __simd_walk_3( _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept { - _PSTL_PRAGMA_SIMD() + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_TO(__first2, __n); + _PSTL_OMP_MAP_TO(__first3, __n); + _PSTL_PRAGMA_SIMD(__n) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first1[__i], __first2[__i], __first3[__i]); + _PSTL_OMP_MAP_FROM(__first2, __n); + _PSTL_OMP_MAP_FROM(__first3, __n); + return __first3 + __n; +} + +/** + * Specialization for std::vector where the base pointer must be extrated to map + * the data to and from the GPU. + */ + +template +_LIBCPP_HIDE_FROM_ABI std::__wrap_iter +__simd_walk_3(std::__wrap_iter __first1, + _DifferenceType __n, + std::__wrap_iter __first2, + std::__wrap_iter __first3, + _Function __f) noexcept { + _PSTL_OMP_MAP_TO(__first1, __n); + _PSTL_OMP_MAP_TO(__first2, __n); + _PSTL_OMP_MAP_TO(__first3, __n); + std::pointer_traits> PT1; + std::pointer_traits> PT2; + std::pointer_traits> PT3; + T1* __data1 = PT1.to_address(__first1); + T2* __data2 = PT2.to_address(__first2); + T3* __data3 = PT3.to_address(__first3); + _PSTL_PRAGMA_SIMD(__n) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__data1[__i], __data2[__i], __data3[__i]); + _PSTL_OMP_MAP_FROM(__first2, __n); + _PSTL_OMP_MAP_FROM(__first3, __n); return __first3 + __n; } template