From 83a02baecfb980d54d5d15c8c620894b785468d9 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 25 Jul 2023 21:49:16 -0700 Subject: [PATCH 01/23] Add a way to build a shared library with dynamic dispatch --- _clang-format | 2 +- lib/meson.build | 31 ++++++ lib/x86simdsort-icl.cpp | 38 +++++++ lib/x86simdsort-internal.h | 59 +++++++++++ lib/x86simdsort-scalar.h | 48 +++++++++ lib/x86simdsort-skx.cpp | 43 ++++++++ lib/x86simdsort-spr.cpp | 23 ++++ lib/x86simdsort.cpp | 140 +++++++++++++++++++++++++ lib/x86simdsort.h | 23 ++++ meson.build | 8 ++ src/avx512-64bit-keyvalue-networks.hpp | 4 +- 11 files changed, 417 insertions(+), 2 deletions(-) create mode 100644 lib/meson.build create mode 100644 lib/x86simdsort-icl.cpp create mode 100644 lib/x86simdsort-internal.h create mode 100644 lib/x86simdsort-scalar.h create mode 100644 lib/x86simdsort-skx.cpp create mode 100644 lib/x86simdsort-spr.cpp create mode 100644 lib/x86simdsort.cpp create mode 100644 lib/x86simdsort.h diff --git a/_clang-format b/_clang-format index 30f08064..98760584 100644 --- a/_clang-format +++ b/_clang-format @@ -63,7 +63,7 @@ KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None +NamespaceIndentation: Inner PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 diff --git a/lib/meson.build b/lib/meson.build new file mode 100644 index 00000000..8bfc75f0 --- /dev/null +++ b/lib/meson.build @@ -0,0 +1,31 @@ +libtargets = [] + +if cpp.has_argument('-march=skylake-avx512') + libtargets += static_library('libskx', + files( + 'x86simdsort-skx.cpp', + ), + include_directories : [src], + cpp_args : ['-O3', '-mavx512f', '-mavx512dq', '-mavx512vl'], + ) +endif + +if cpp.has_argument('-march=icelake-client') + libtargets += static_library('libicl', + files( + 'x86simdsort-icl.cpp', + ), + include_directories : [src], + cpp_args : ['-O3', '-mavx512f', '-mavx512vbmi2', '-mavx512bw', '-mavx512vl', '-mf16c'], + ) +endif + +if cancompilefp16 + libtargets += static_library('libspr', + files( + 'x86simdsort-spr.cpp', + ), + include_directories : [src], + cpp_args : ['-O3', '-mavx512f', '-mavx512fp16', '-mavx512vbmi2'], + ) +endif diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp new file mode 100644 index 00000000..651d0f1b --- /dev/null +++ b/lib/x86simdsort-icl.cpp @@ -0,0 +1,38 @@ +// ICL specific routines: +#include "avx512-16bit-qsort.hpp" +#include "x86simdsort-internal.h" + +namespace xss { +namespace avx512 { + template <> + void qsort(uint16_t *arr, int64_t size) + { + avx512_qsort(arr, size); + } + template <> + void qselect(uint16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + { + avx512_qselect(arr, k, arrsize, hasnan); + } + template <> + void partial_qsort(uint16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + { + avx512_partial_qsort(arr, k, arrsize, hasnan); + } + template <> + void qsort(int16_t *arr, int64_t size) + { + avx512_qsort(arr, size); + } + template <> + void qselect(int16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + { + avx512_qselect(arr, k, arrsize, hasnan); + } + template <> + void partial_qsort(int16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + { + avx512_partial_qsort(arr, k, arrsize, hasnan); + } +} // namespace avx512 +} // namespace xss diff --git a/lib/x86simdsort-internal.h b/lib/x86simdsort-internal.h new file mode 100644 index 00000000..2ddfaefb --- /dev/null +++ b/lib/x86simdsort-internal.h @@ -0,0 +1,59 @@ +#ifndef XSS_ALL_METHODS +#define XSS_ALL_METHODS +#include +#include + +namespace xss { +namespace avx512 { + // quicksort + template + void qsort(T *arr, int64_t arrsize); + // quickselect + template + void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + // partial sort + template + void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + // argsort + template + std::vector argsort(T *arr, int64_t arrsize); + // argselect + template + std::vector argselect(T *arr, int64_t k, int64_t arrsize); +} // namespace avx512 +namespace avx2 { + // quicksort + template + void qsort(T *arr, int64_t arrsize); + // quickselect + template + void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + // partial sort + template + void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + // argsort + template + std::vector argsort(T *arr, int64_t arrsize); + // argselect + template + std::vector argselect(T *arr, int64_t k, int64_t arrsize); +} // namespace avx2 +namespace scalar { + // quicksort + template + void qsort(T *arr, int64_t arrsize); + // quickselect + template + void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + // partial sort + template + void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + // argsort + template + std::vector argsort(T *arr, int64_t arrsize); + // argselect + template + std::vector argselect(T *arr, int64_t k, int64_t arrsize); +} // namespace scalar +} // namespace xss +#endif diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h new file mode 100644 index 00000000..b32163d3 --- /dev/null +++ b/lib/x86simdsort-scalar.h @@ -0,0 +1,48 @@ +#include +#include +namespace xss { +namespace scalar { + /* TODO: handle NAN */ + template + void qsort(T *arr, int64_t arrsize) + { + std::sort(arr, arr + arrsize); + } + template + void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan) + { + std::nth_element(arr, arr + k, arr + arrsize); + } + template + void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan) + { + std::partial_sort(arr, arr + k, arr + arrsize); + } + template + std::vector argsort(T *arr, int64_t arrsize) + { + std::vector arg(arrsize); + std::iota(arg.begin(), arg.end(), 0); + std::sort(arg.begin(), + arg.end(), + [arr](int64_t left, int64_t right) -> bool { + return arr[left] < arr[right]; + }); + return arg; + } + template + std::vector argselect(T *arr, int64_t k, int64_t arrsize) + { + std::vector arg(arrsize); + std::iota(arg.begin(), arg.end(), 0); + std::nth_element(arg.begin(), + arg.begin() + k, + arg.end(), + [arr](int64_t left, int64_t right) -> bool { + return arr[left] < arr[right]; + }); + return arg; + } + +} // namespace scalar +} // namespace xss diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp new file mode 100644 index 00000000..25692e60 --- /dev/null +++ b/lib/x86simdsort-skx.cpp @@ -0,0 +1,43 @@ +// SKX specific routines: +#include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-argsort.hpp" +#include "avx512-64bit-qsort.hpp" +#include "x86simdsort-internal.h" + +#define DEFINE_ALL_METHODS(type) \ + template <> \ + void qsort(type *arr, int64_t arrsize) \ + { \ + avx512_qsort(arr, arrsize); \ + } \ + template <> \ + void qselect(type *arr, int64_t k, int64_t arrsize, bool hasnan) \ + { \ + avx512_qselect(arr, k, arrsize, hasnan); \ + } \ + template <> \ + void partial_qsort(type *arr, int64_t k, int64_t arrsize, bool hasnan) \ + { \ + avx512_partial_qsort(arr, k, arrsize, hasnan); \ + } \ + template <> \ + std::vector argsort(type *arr, int64_t arrsize) \ + { \ + return avx512_argsort(arr, arrsize); \ + } \ + template <> \ + std::vector argselect(type *arr, int64_t k, int64_t arrsize) \ + { \ + return avx512_argselect(arr, k, arrsize); \ + } + +namespace xss { +namespace avx512 { + DEFINE_ALL_METHODS(uint32_t) + DEFINE_ALL_METHODS(int32_t) + DEFINE_ALL_METHODS(float) + DEFINE_ALL_METHODS(uint64_t) + DEFINE_ALL_METHODS(int64_t) + DEFINE_ALL_METHODS(double) +} // namespace avx512 +} // namespace xss diff --git a/lib/x86simdsort-spr.cpp b/lib/x86simdsort-spr.cpp new file mode 100644 index 00000000..dd4c1f17 --- /dev/null +++ b/lib/x86simdsort-spr.cpp @@ -0,0 +1,23 @@ +// SPR specific routines: +#include "avx512fp16-16bit-qsort.hpp" +#include "x86simdsort-internal.h" + +namespace xss { +namespace avx512 { + template <> + void qsort(_Float16 *arr, int64_t size) + { + avx512_qsort(arr, size); + } + template <> + void qselect(_Float16 *arr, int64_t k, int64_t arrsize, bool hasnan) + { + avx512_qselect(arr, k, arrsize, hasnan); + } + template <> + void partial_qsort(_Float16 *arr, int64_t k, int64_t arrsize, bool hasnan) + { + avx512_partial_qsort(arr, k, arrsize, hasnan); + } +} // namespace avx512 +} // namespace xss diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp new file mode 100644 index 00000000..983bd1ba --- /dev/null +++ b/lib/x86simdsort.cpp @@ -0,0 +1,140 @@ +#include "x86simdsort.h" +#include "x86simdsort-internal.h" +#include "x86simdsort-scalar.h" +#include +#include +#include + +static int check_cpu_feature_support(std::string_view cpufeature) +{ + if (cpufeature == "avx512_spr") + return __builtin_cpu_supports("avx512f") + && __builtin_cpu_supports("avx512fp16") + && __builtin_cpu_supports("avx512vbmi2"); + else if (cpufeature == "avx512_icl") + return __builtin_cpu_supports("avx512f") + && __builtin_cpu_supports("avx512vbmi2") + && __builtin_cpu_supports("avx512bw") + && __builtin_cpu_supports("avx512vl"); + else if (cpufeature == "avx512_skx") + return __builtin_cpu_supports("avx512f") + && __builtin_cpu_supports("avx512dq") + && __builtin_cpu_supports("avx512vl"); + else if (cpufeature == "avx2") + return __builtin_cpu_supports("avx2"); + + return 0; +} + +std::string_view +find_preferred_cpu(std::initializer_list cpulist) +{ + for (auto cpu : cpulist) { + if (check_cpu_feature_support(cpu)) return cpu; + } + return "scalar"; +} + +constexpr bool +dispatch_requested(std::string_view cpurequested, + std::initializer_list cpulist) +{ + for (auto cpu : cpulist) { + if (cpu.find(cpurequested) != std::string_view::npos) return true; + } + return false; +} + +#define CAT_(a, b) a ## b +#define CAT(a, b) CAT_(a, b) + +#define DECLARE_INTERNAL_qsort(TYPE) \ + static void (*internal_qsort##TYPE)(TYPE *, int64_t) = NULL; \ + template <> \ + void qsort(TYPE *arr, int64_t arrsize) \ + { \ + (*internal_qsort##TYPE)(arr, arrsize); \ + } + +#define DECLARE_INTERNAL_qselect(TYPE) \ + static void (*internal_qselect##TYPE)(TYPE *, int64_t, int64_t, bool) = NULL; \ + template <> \ + void qselect(TYPE *arr, int64_t k, int64_t arrsize, bool hasnan) \ + { \ + (*internal_qselect##TYPE)(arr, k, arrsize, hasnan); \ + } + +#define DECLARE_INTERNAL_partial_qsort(TYPE) \ + static void (*internal_partial_qsort##TYPE)(TYPE *, int64_t, int64_t, bool) = NULL; \ + template <> \ + void partial_qsort(TYPE *arr, int64_t k, int64_t arrsize, bool hasnan) \ + { \ + (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan); \ + } + +#define DECLARE_INTERNAL_argsort(TYPE) \ + static std::vector (*internal_argsort##TYPE)(TYPE *, int64_t) = NULL; \ + template <> \ + std::vector argsort(TYPE *arr, int64_t arrsize) \ + { \ + return (*internal_argsort##TYPE)(arr, arrsize); \ + } + +#define DECLARE_INTERNAL_argselect(TYPE) \ + static std::vector (*internal_argselect##TYPE)(TYPE *, int64_t, int64_t) = NULL; \ + template <> \ + std::vector argselect(TYPE *arr, int64_t k, int64_t arrsize) \ + { \ + return (*internal_argselect##TYPE)(arr, k, arrsize); \ + } + +/* runtime dispatch mechanism */ +#define DISPATCH(func, TYPE, ...) \ + DECLARE_INTERNAL_##func(TYPE) \ + static __attribute__((constructor)) void CAT(CAT(resolve_, func), TYPE)(void) \ + { \ + CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ + __builtin_cpu_init(); \ + std::string_view preferred_cpu = find_preferred_cpu({__VA_ARGS__}); \ + if constexpr (dispatch_requested("avx512", {__VA_ARGS__})) { \ + if (preferred_cpu.find("avx512") != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) = &xss::avx512::func; \ + return; \ + } \ + } \ + else if constexpr (dispatch_requested("avx2", {__VA_ARGS__})) { \ + if (preferred_cpu.find("avx2") != std::string_view::npos) { \ + CAT(CAT(internal_, func), TYPE) = &xss::avx2::func; \ + return; \ + } \ + } \ + } + + + +namespace x86simdsort { +#ifdef __FLT16_MAX__ +DISPATCH(qsort, _Float16, "avx512_spr") +DISPATCH(qselect, _Float16, "avx512_spr") +DISPATCH(partial_qsort, _Float16, "avx512_spr") +DISPATCH(argsort, _Float16, "none") +DISPATCH(argselect, _Float16, "none") +#endif + +#define DISPATCH_ALL(func, ISA_16BIT, ISA_32BIT, ISA_64BIT) \ + DISPATCH(func, uint16_t, ISA_16BIT)\ + DISPATCH(func, int16_t, ISA_16BIT)\ + DISPATCH(func, float, ISA_32BIT)\ + DISPATCH(func, int32_t, ISA_32BIT)\ + DISPATCH(func, uint32_t, ISA_32BIT)\ + DISPATCH(func, int64_t, ISA_64BIT)\ + DISPATCH(func, uint64_t, ISA_64BIT)\ + DISPATCH(func, double, ISA_64BIT)\ + +DISPATCH_ALL(qsort, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) +DISPATCH_ALL(qselect, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) +DISPATCH_ALL(partial_qsort, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) +DISPATCH_ALL(argsort, "none", "avx512_skx", "avx512_skx") +DISPATCH_ALL(argselect, "none", "avx512_skx", "avx512_skx") + +} // namespace simdsort diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h new file mode 100644 index 00000000..a4a1c7f6 --- /dev/null +++ b/lib/x86simdsort.h @@ -0,0 +1,23 @@ +#ifndef X86_SIMD_SORT +#define X86_SIMD_SORT +#include +#include + +namespace x86simdsort { +// quicksort +template +void qsort(T *arr, int64_t arrsize); +// quickselect +template +void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); +// partial sort +template +void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); +// argsort +template +std::vector argsort(T *arr, int64_t arrsize); +// argselect +template +std::vector argselect(T *arr, int64_t k, int64_t arrsize); +} // namespace x86simdsort +#endif diff --git a/meson.build b/meson.build index a10598f9..68c442fb 100644 --- a/meson.build +++ b/meson.build @@ -4,6 +4,7 @@ project('x86-simd-sort', 'cpp', default_options : ['cpp_std=c++17']) cpp = meson.get_compiler('cpp') src = include_directories('src') +lib = include_directories('lib') bench = include_directories('benchmarks') utils = include_directories('utils') tests = include_directories('tests') @@ -19,9 +20,16 @@ int main() { ''' cancompilefp16 = cpp.compiles(fp16code, args:'-march=sapphirerapids') +subdir('lib') subdir('tests') subdir('benchmarks') +libsimdsort = shared_library('x86simdsort', + 'lib/x86simdsort.cpp', + include_directories : [lib], + link_whole : [libtargets], + ) + testexe = executable('testexe', include_directories : [src, utils], dependencies : gtest_dep, diff --git a/src/avx512-64bit-keyvalue-networks.hpp b/src/avx512-64bit-keyvalue-networks.hpp index e9577b79..12527f95 100644 --- a/src/avx512-64bit-keyvalue-networks.hpp +++ b/src/avx512-64bit-keyvalue-networks.hpp @@ -1,4 +1,5 @@ - +#ifndef AVX512_KEYVALUE_NETWORKS +#define AVX512_KEYVALUE_NETWORKS template Date: Wed, 13 Sep 2023 13:09:10 -0700 Subject: [PATCH 02/23] Get rid of unused variable warning --- lib/x86simdsort-scalar.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h index b32163d3..8e25de9a 100644 --- a/lib/x86simdsort-scalar.h +++ b/lib/x86simdsort-scalar.h @@ -1,5 +1,6 @@ #include #include +#define UNUSED(x) (void)(x) namespace xss { namespace scalar { /* TODO: handle NAN */ @@ -11,11 +12,13 @@ namespace scalar { template void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan) { + UNUSED(hasnan); std::nth_element(arr, arr + k, arr + arrsize); } template void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan) { + UNUSED(hasnan); std::partial_sort(arr, arr + k, arr + arrsize); } template From 72169c058ade41bb27aa2984fa42e94e08f8181d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 13 Sep 2023 13:11:03 -0700 Subject: [PATCH 03/23] Run clang format --- lib/x86simdsort-scalar.h | 32 ++++++++++++++++---------------- lib/x86simdsort.cpp | 40 ++++++++++++++++++++++------------------ 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h index 8e25de9a..41bd3724 100644 --- a/lib/x86simdsort-scalar.h +++ b/lib/x86simdsort-scalar.h @@ -24,26 +24,26 @@ namespace scalar { template std::vector argsort(T *arr, int64_t arrsize) { - std::vector arg(arrsize); - std::iota(arg.begin(), arg.end(), 0); - std::sort(arg.begin(), - arg.end(), - [arr](int64_t left, int64_t right) -> bool { - return arr[left] < arr[right]; - }); - return arg; + std::vector arg(arrsize); + std::iota(arg.begin(), arg.end(), 0); + std::sort(arg.begin(), + arg.end(), + [arr](int64_t left, int64_t right) -> bool { + return arr[left] < arr[right]; + }); + return arg; } template std::vector argselect(T *arr, int64_t k, int64_t arrsize) { - std::vector arg(arrsize); - std::iota(arg.begin(), arg.end(), 0); - std::nth_element(arg.begin(), - arg.begin() + k, - arg.end(), - [arr](int64_t left, int64_t right) -> bool { - return arr[left] < arr[right]; - }); + std::vector arg(arrsize); + std::iota(arg.begin(), arg.end(), 0); + std::nth_element(arg.begin(), + arg.begin() + k, + arg.end(), + [arr](int64_t left, int64_t right) -> bool { + return arr[left] < arr[right]; + }); return arg; } diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 983bd1ba..6e1e2410 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -45,7 +45,7 @@ dispatch_requested(std::string_view cpurequested, return false; } -#define CAT_(a, b) a ## b +#define CAT_(a, b) a##b #define CAT(a, b) CAT_(a, b) #define DECLARE_INTERNAL_qsort(TYPE) \ @@ -57,7 +57,8 @@ dispatch_requested(std::string_view cpurequested, } #define DECLARE_INTERNAL_qselect(TYPE) \ - static void (*internal_qselect##TYPE)(TYPE *, int64_t, int64_t, bool) = NULL; \ + static void (*internal_qselect##TYPE)(TYPE *, int64_t, int64_t, bool) \ + = NULL; \ template <> \ void qselect(TYPE *arr, int64_t k, int64_t arrsize, bool hasnan) \ { \ @@ -65,7 +66,9 @@ dispatch_requested(std::string_view cpurequested, } #define DECLARE_INTERNAL_partial_qsort(TYPE) \ - static void (*internal_partial_qsort##TYPE)(TYPE *, int64_t, int64_t, bool) = NULL; \ + static void (*internal_partial_qsort##TYPE)( \ + TYPE *, int64_t, int64_t, bool) \ + = NULL; \ template <> \ void partial_qsort(TYPE *arr, int64_t k, int64_t arrsize, bool hasnan) \ { \ @@ -73,7 +76,8 @@ dispatch_requested(std::string_view cpurequested, } #define DECLARE_INTERNAL_argsort(TYPE) \ - static std::vector (*internal_argsort##TYPE)(TYPE *, int64_t) = NULL; \ + static std::vector (*internal_argsort##TYPE)(TYPE *, int64_t) \ + = NULL; \ template <> \ std::vector argsort(TYPE *arr, int64_t arrsize) \ { \ @@ -81,7 +85,9 @@ dispatch_requested(std::string_view cpurequested, } #define DECLARE_INTERNAL_argselect(TYPE) \ - static std::vector (*internal_argselect##TYPE)(TYPE *, int64_t, int64_t) = NULL; \ + static std::vector (*internal_argselect##TYPE)( \ + TYPE *, int64_t, int64_t) \ + = NULL; \ template <> \ std::vector argselect(TYPE *arr, int64_t k, int64_t arrsize) \ { \ @@ -90,8 +96,8 @@ dispatch_requested(std::string_view cpurequested, /* runtime dispatch mechanism */ #define DISPATCH(func, TYPE, ...) \ - DECLARE_INTERNAL_##func(TYPE) \ - static __attribute__((constructor)) void CAT(CAT(resolve_, func), TYPE)(void) \ + DECLARE_INTERNAL_##func(TYPE) static __attribute__((constructor)) void \ + CAT(CAT(resolve_, func), TYPE)(void) \ { \ CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ __builtin_cpu_init(); \ @@ -110,8 +116,6 @@ dispatch_requested(std::string_view cpurequested, } \ } - - namespace x86simdsort { #ifdef __FLT16_MAX__ DISPATCH(qsort, _Float16, "avx512_spr") @@ -122,14 +126,14 @@ DISPATCH(argselect, _Float16, "none") #endif #define DISPATCH_ALL(func, ISA_16BIT, ISA_32BIT, ISA_64BIT) \ - DISPATCH(func, uint16_t, ISA_16BIT)\ - DISPATCH(func, int16_t, ISA_16BIT)\ - DISPATCH(func, float, ISA_32BIT)\ - DISPATCH(func, int32_t, ISA_32BIT)\ - DISPATCH(func, uint32_t, ISA_32BIT)\ - DISPATCH(func, int64_t, ISA_64BIT)\ - DISPATCH(func, uint64_t, ISA_64BIT)\ - DISPATCH(func, double, ISA_64BIT)\ + DISPATCH(func, uint16_t, ISA_16BIT) \ + DISPATCH(func, int16_t, ISA_16BIT) \ + DISPATCH(func, float, ISA_32BIT) \ + DISPATCH(func, int32_t, ISA_32BIT) \ + DISPATCH(func, uint32_t, ISA_32BIT) \ + DISPATCH(func, int64_t, ISA_64BIT) \ + DISPATCH(func, uint64_t, ISA_64BIT) \ + DISPATCH(func, double, ISA_64BIT) DISPATCH_ALL(qsort, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) DISPATCH_ALL(qselect, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) @@ -137,4 +141,4 @@ DISPATCH_ALL(partial_qsort, ("avx512_icl"), ("avx512_skx"), ("avx512_skx")) DISPATCH_ALL(argsort, "none", "avx512_skx", "avx512_skx") DISPATCH_ALL(argselect, "none", "avx512_skx", "avx512_skx") -} // namespace simdsort +} // namespace x86simdsort From 324713a73418d33c28d0446d974197c003ac181e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 14 Sep 2023 09:32:53 -0700 Subject: [PATCH 04/23] Fix build failure for __builtin_cpu_supports(avx512fp16) --- lib/x86simdsort.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 6e1e2410..3c212de3 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -8,9 +8,13 @@ static int check_cpu_feature_support(std::string_view cpufeature) { if (cpufeature == "avx512_spr") +#ifdef __FLT16_MAX__ return __builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512fp16") && __builtin_cpu_supports("avx512vbmi2"); +#else + return 0; +#endif else if (cpufeature == "avx512_icl") return __builtin_cpu_supports("avx512f") && __builtin_cpu_supports("avx512vbmi2") From 1fd5b1b7cc97d4051f92797dcd6449769d48c8c4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 14 Sep 2023 13:20:50 -0700 Subject: [PATCH 05/23] Update benchmarks to use x86simdsort shared library --- .../{bench-qsort-common.h => bench-all.cpp} | 40 +++- benchmarks/bench-argsort.hpp | 75 ++----- benchmarks/bench-partial-qsort.hpp | 95 +++------ benchmarks/bench-qselect.hpp | 65 +++--- benchmarks/bench-qsort.cpp | 4 - benchmarks/bench-qsort.hpp | 82 ++----- benchmarks/bench-qsortfp16.cpp | 201 ------------------ benchmarks/bench-tgl.out | 28 --- benchmarks/meson.build | 25 +-- meson.build | 4 +- run-bench.py | 18 +- utils/rand_array.h | 70 +++--- 12 files changed, 190 insertions(+), 517 deletions(-) rename benchmarks/{bench-qsort-common.h => bench-all.cpp} (65%) delete mode 100644 benchmarks/bench-qsort.cpp delete mode 100644 benchmarks/bench-qsortfp16.cpp delete mode 100644 benchmarks/bench-tgl.out diff --git a/benchmarks/bench-qsort-common.h b/benchmarks/bench-all.cpp similarity index 65% rename from benchmarks/bench-qsort-common.h rename to benchmarks/bench-all.cpp index 60792618..1df466cb 100644 --- a/benchmarks/bench-qsort-common.h +++ b/benchmarks/bench-all.cpp @@ -1,14 +1,24 @@ -#ifndef AVX512_BENCH_COMMON -#define AVX512_BENCH_COMMON - -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-argsort.hpp" -#include "avx512-64bit-qsort.hpp" - +#include "x86simdsort.h" #include "rand_array.h" #include +#ifdef __FLT16_MAX__ +template <> +std::vector<_Float16> get_uniform_rand_array( + int64_t arrsize, + _Float16 max, + _Float16 min) +{ + (void)(max); (void)(min); + std::vector<_Float16> arr; + for (auto jj = 0; jj < arrsize; ++jj) { + _Float16 temp = (float)rand() / (float)(RAND_MAX); + arr.push_back(temp); + } + return arr; +} +#endif + #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \ BENCHMARK_PRIVATE_DECLARE(func) \ = (::benchmark::internal::RegisterBenchmarkInternal( \ @@ -18,7 +28,7 @@ func(st, __VA_ARGS__); \ }))) -#define BENCH(func, type) \ +#define BENCH_SORT(func, type) \ MY_BENCHMARK_CAPTURE(func, type, smallrandom_128, 128, std::string("random")); \ MY_BENCHMARK_CAPTURE(func, type, smallrandom_256, 256, std::string("random")); \ MY_BENCHMARK_CAPTURE(func, type, smallrandom_512, 512, std::string("random")); \ @@ -37,4 +47,14 @@ MY_BENCHMARK_CAPTURE( \ func, type, reverse_10k, 10000, std::string("reverse")); -#endif +#define BENCH_PARTIAL(func, type) \ + MY_BENCHMARK_CAPTURE(func, type, k10, 10000, 10); \ + MY_BENCHMARK_CAPTURE(func, type, k100, 10000, 100); \ + MY_BENCHMARK_CAPTURE(func, type, k1000, 10000, 1000); \ + MY_BENCHMARK_CAPTURE(func, type, k5000, 10000, 5000); \ + +#include "bench-argsort.hpp" +#include "bench-partial-qsort.hpp" +#include "bench-qselect.hpp" +#include "bench-qsort.hpp" + diff --git a/benchmarks/bench-argsort.hpp b/benchmarks/bench-argsort.hpp index 905fb581..cf1a39a1 100644 --- a/benchmarks/bench-argsort.hpp +++ b/benchmarks/bench-argsort.hpp @@ -1,5 +1,3 @@ -#include "bench-qsort-common.h" - template std::vector stdargsort(const std::vector &array) { @@ -16,77 +14,42 @@ std::vector stdargsort(const std::vector &array) } template -static void stdargsort(benchmark::State &state, Args &&...args) +static void scalarargsort(benchmark::State &state, Args &&...args) { + // get args auto args_tuple = std::make_tuple(std::move(args)...); - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector inx; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector arr_bkp = arr; + std::vector inx; + // benchmark for (auto _ : state) { inx = stdargsort(arr); } } template -static void avx512argsort(benchmark::State &state, Args &&...args) +static void simdargsort(benchmark::State &state, Args &&...args) { + // get args auto args_tuple = std::make_tuple(std::move(args)...); - if (!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector inx; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector arr_bkp = arr; + std::vector inx; + // benchmark for (auto _ : state) { - inx = avx512_argsort(arr.data(), ARRSIZE); + inx = x86simdsort::argsort(arr.data(), arrsize); } } #define BENCH_BOTH(type) \ - BENCH(avx512argsort, type) \ - BENCH(stdargsort, type) + BENCH_SORT(simdargsort, type) \ + BENCH_SORT(scalarargsort, type) BENCH_BOTH(int64_t) BENCH_BOTH(uint64_t) diff --git a/benchmarks/bench-partial-qsort.hpp b/benchmarks/bench-partial-qsort.hpp index c5091392..77663d39 100644 --- a/benchmarks/bench-partial-qsort.hpp +++ b/benchmarks/bench-partial-qsort.hpp @@ -1,17 +1,10 @@ -#include "bench-qsort-common.h" - -template -static void avx512_partial_qsort(benchmark::State &state) +template +static void simdpartialsort(benchmark::State &state, Args &&...args) { - if (!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) { - state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); - } // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -19,9 +12,9 @@ static void avx512_partial_qsort(benchmark::State &state) arr = get_uniform_rand_array(ARRSIZE); arr_bkp = arr; - /* call avx512_partial_qsort */ + /* call simdpartialsort */ for (auto _ : state) { - avx512_partial_qsort(arr.data(), K, ARRSIZE); + x86simdsort::partial_qsort(arr.data(), k, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -29,12 +22,13 @@ static void avx512_partial_qsort(benchmark::State &state) } } -template -static void stdpartialsort(benchmark::State &state) +template +static void scalarpartialsort(benchmark::State &state, Args &&...args) { // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -44,7 +38,7 @@ static void stdpartialsort(benchmark::State &state) /* call std::partial_sort */ for (auto _ : state) { - std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); + std::partial_sort(arr.begin(), arr.begin() + k, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -52,51 +46,18 @@ static void stdpartialsort(benchmark::State &state) } } -// Register the function as a benchmark -BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -//BENCHMARK(avx512_partial_qsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_partial_qsort) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +#define BENCH_BOTH_PARTIAL(type) \ + BENCH_PARTIAL(simdpartialsort, type) \ + BENCH_PARTIAL(scalarpartialsort, type) + +BENCH_BOTH_PARTIAL(uint64_t) +BENCH_BOTH_PARTIAL(int64_t) +BENCH_BOTH_PARTIAL(uint32_t) +BENCH_BOTH_PARTIAL(int32_t) +BENCH_BOTH_PARTIAL(uint16_t) +BENCH_BOTH_PARTIAL(int16_t) +BENCH_BOTH_PARTIAL(float) +BENCH_BOTH_PARTIAL(double) +#ifdef __FLT16_MAX__ +BENCH_BOTH_PARTIAL(_Float16) +#endif diff --git a/benchmarks/bench-qselect.hpp b/benchmarks/bench-qselect.hpp index af3c401a..0dab181e 100644 --- a/benchmarks/bench-qselect.hpp +++ b/benchmarks/bench-qselect.hpp @@ -1,17 +1,10 @@ -#include "bench-qsort-common.h" - -template -static void avx512_qselect(benchmark::State &state) +template +static void simdqselect(benchmark::State &state, Args &&...args) { - if (!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) { - state.SkipWithMessage("Requires AVX512 VBMI2 ISA"); - } // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -21,7 +14,7 @@ static void avx512_qselect(benchmark::State &state) /* call avx512 quickselect */ for (auto _ : state) { - avx512_qselect(arr.data(), K, ARRSIZE); + x86simdsort::qselect(arr.data(), k, ARRSIZE); state.PauseTiming(); arr = arr_bkp; @@ -29,12 +22,13 @@ static void avx512_qselect(benchmark::State &state) } } -template -static void stdnthelement(benchmark::State &state) +template +static void scalarqselect(benchmark::State &state, Args &&...args) { // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; + auto args_tuple = std::make_tuple(std::move(args)...); + int64_t ARRSIZE = std::get<0>(args_tuple); + int64_t k = std::get<1>(args_tuple); std::vector arr; std::vector arr_bkp; @@ -44,7 +38,7 @@ static void stdnthelement(benchmark::State &state) /* call std::nth_element */ for (auto _ : state) { - std::nth_element(arr.begin(), arr.begin() + K, arr.end()); + std::nth_element(arr.begin(), arr.begin() + k, arr.end()); state.PauseTiming(); arr = arr_bkp; @@ -52,23 +46,18 @@ static void stdnthelement(benchmark::State &state) } } -// Register the function as a benchmark -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -//BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(avx512_qselect)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); +#define BENCH_BOTH_QSELECT(type) \ + BENCH_PARTIAL(simdqselect, type) \ + BENCH_PARTIAL(scalarqselect, type) + +BENCH_BOTH_QSELECT(uint64_t) +BENCH_BOTH_QSELECT(int64_t) +BENCH_BOTH_QSELECT(uint32_t) +BENCH_BOTH_QSELECT(int32_t) +BENCH_BOTH_QSELECT(uint16_t) +BENCH_BOTH_QSELECT(int16_t) +BENCH_BOTH_QSELECT(float) +BENCH_BOTH_QSELECT(double) +#ifdef __FLT16_MAX__ +BENCH_BOTH_QSELECT(_Float16) +#endif diff --git a/benchmarks/bench-qsort.cpp b/benchmarks/bench-qsort.cpp deleted file mode 100644 index 97e78ffc..00000000 --- a/benchmarks/bench-qsort.cpp +++ /dev/null @@ -1,4 +0,0 @@ -#include "bench-qsort.hpp" -#include "bench-argsort.hpp" -#include "bench-partial-qsort.hpp" -#include "bench-qselect.hpp" diff --git a/benchmarks/bench-qsort.hpp b/benchmarks/bench-qsort.hpp index 3b03b1da..277f7bf5 100644 --- a/benchmarks/bench-qsort.hpp +++ b/benchmarks/bench-qsort.hpp @@ -1,34 +1,14 @@ -#include "bench-qsort-common.h" - template -static void stdsort(benchmark::State &state, Args &&...args) +static void scalarsort(benchmark::State &state, Args &&...args) { + // Get args auto args_tuple = std::make_tuple(std::move(args)...); - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector arr_bkp; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - arr_bkp = arr; - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector arr_bkp = arr; + // benchmark for (auto _ : state) { std::sort(arr.begin(), arr.end()); state.PauseTiming(); @@ -38,42 +18,18 @@ static void stdsort(benchmark::State &state, Args &&...args) } template -static void avx512qsort(benchmark::State &state, Args &&...args) +static void simdsort(benchmark::State &state, Args &&...args) { + // Get args auto args_tuple = std::make_tuple(std::move(args)...); - if (!__builtin_cpu_supports("avx512bw")) { - state.SkipWithMessage("Requires AVX512 BW ISA"); - } - if ((sizeof(T) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) { - state.SkipWithMessage("Requires AVX512 VBMI2"); - } - // Perform setup here - size_t ARRSIZE = std::get<0>(args_tuple); - std::vector arr; - std::vector arr_bkp; - + size_t arrsize = std::get<0>(args_tuple); std::string arrtype = std::get<1>(args_tuple); - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } - else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - } - else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (size_t ii = 0; ii < ARRSIZE; ++ii) { - arr.push_back(temp); - } - } - else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - } - arr_bkp = arr; - - /* call avx512 quicksort */ + // set up array + std::vector arr = get_array(arrtype, arrsize); + std::vector arr_bkp = arr; + // benchmark for (auto _ : state) { - avx512_qsort(arr.data(), ARRSIZE); + x86simdsort::qsort(arr.data(), arrsize); state.PauseTiming(); arr = arr_bkp; state.ResumeTiming(); @@ -81,8 +37,8 @@ static void avx512qsort(benchmark::State &state, Args &&...args) } #define BENCH_BOTH_QSORT(type) \ - BENCH(avx512qsort, type) \ - BENCH(stdsort, type) + BENCH_SORT(simdsort, type) \ + BENCH_SORT(scalarsort, type) BENCH_BOTH_QSORT(uint64_t) BENCH_BOTH_QSORT(int64_t) @@ -92,3 +48,7 @@ BENCH_BOTH_QSORT(uint16_t) BENCH_BOTH_QSORT(int16_t) BENCH_BOTH_QSORT(float) BENCH_BOTH_QSORT(double) +#ifdef __FLT16_MAX__ +BENCH_BOTH_QSORT(_Float16) +#endif + diff --git a/benchmarks/bench-qsortfp16.cpp b/benchmarks/bench-qsortfp16.cpp deleted file mode 100644 index 769c2c2f..00000000 --- a/benchmarks/bench-qsortfp16.cpp +++ /dev/null @@ -1,201 +0,0 @@ -#include "avx512fp16-16bit-qsort.hpp" - -#include "rand_array.h" -#include - -template -static void avx512_qsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call avx512 quicksort */ - for (auto _ : state) { - avx512_qsort(arr.data(), ARRSIZE); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -template -static void stdsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - size_t ARRSIZE = state.range(0); - std::vector arr; - std::vector arr_bkp; - - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call std::sort */ - for (auto _ : state) { - std::sort(arr.begin(), arr.end()); - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_qsort<_Float16>)->Arg(10000)->Arg(1000000); -BENCHMARK(stdsort<_Float16>)->Arg(10000)->Arg(1000000); - -template -static void avx512_qselect(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call avx512 quickselect */ - for (auto _ : state) { - avx512_qselect(arr.data(), K, ARRSIZE); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -template -static void stdnthelement(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call std::nth_element */ - for (auto _ : state) { - std::nth_element(arr.begin(), arr.begin() + K, arr.end()); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_qselect<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); -BENCHMARK(stdnthelement<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); - -template -static void avx512_partial_qsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call avx512_partial_qsort */ - for (auto _ : state) { - avx512_partial_qsort(arr.data(), K, ARRSIZE); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -template -static void stdpartialsort(benchmark::State &state) -{ - if (__builtin_cpu_supports("avx512fp16")) { - // Perform setup here - int64_t K = state.range(0); - size_t ARRSIZE = 10000; - std::vector arr; - std::vector arr_bkp; - - /* Initialize elements */ - for (size_t jj = 0; jj < ARRSIZE; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - arr_bkp = arr; - - /* call std::partial_sort */ - for (auto _ : state) { - std::partial_sort(arr.begin(), arr.begin() + K, arr.end()); - - state.PauseTiming(); - arr = arr_bkp; - state.ResumeTiming(); - } - } - else { - state.SkipWithMessage("Requires AVX512-FP16 ISA"); - } -} - -// Register the function as a benchmark -BENCHMARK(avx512_partial_qsort<_Float16>) - ->Arg(10) - ->Arg(100) - ->Arg(1000) - ->Arg(5000); -BENCHMARK(stdpartialsort<_Float16>)->Arg(10)->Arg(100)->Arg(1000)->Arg(5000); diff --git a/benchmarks/bench-tgl.out b/benchmarks/bench-tgl.out deleted file mode 100644 index 1bb03936..00000000 --- a/benchmarks/bench-tgl.out +++ /dev/null @@ -1,28 +0,0 @@ -|-----------------+-------------+------------+-----------------+-----------+----------| -| Array data type | typeid name | array size | avx512_qsort | std::sort | speed up | -|-----------------+-------------+------------+-----------------+-----------+----------| -| uniform random | uint32_t | 10000 | 115697 | 1579118 | 13.6 | -| uniform random | uint32_t | 100000 | 1786812 | 19973203 | 11.2 | -| uniform random | uint32_t | 1000000 | 22536966 | 233470422 | 10.4 | -| uniform random | int32_t | 10000 | 95591 | 1569108 | 16.4 | -| uniform random | int32_t | 100000 | 1790362 | 19785007 | 11.1 | -| uniform random | int32_t | 1000000 | 22874571 | 233358497 | 10.2 | -| uniform random | float | 10000 | 113316 | 1668407 | 14.7 | -| uniform random | float | 100000 | 1920018 | 21815024 | 11.4 | -| uniform random | float | 1000000 | 24776954 | 256867990 | 10.4 | -| uniform random | uint64_t | 10000 | 233501 | 1537649 | 6.6 | -| uniform random | uint64_t | 100000 | 3991372 | 19559859 | 4.9 | -| uniform random | uint64_t | 1000000 | 49818870 | 232687666 | 4.7 | -| uniform random | int64_t | 10000 | 228000 | 1445131 | 6.3 | -| uniform random | int64_t | 100000 | 3892092 | 18917322 | 4.9 | -| uniform random | int64_t | 1000000 | 48957088 | 235100259 | 4.8 | -| uniform random | double | 10000 | 180307 | 1702801 | 9.4 | -| uniform random | double | 100000 | 3596886 | 21849587 | 6.1 | -| uniform random | double | 1000000 | 47724381 | 258014177 | 5.4 | -| uniform random | uint16_t | 10000 | 84732 | 1548275 | 18.3 | -| uniform random | uint16_t | 100000 | 1406417 | 19632858 | 14.0 | -| uniform random | uint16_t | 1000000 | 17119960 | 214085305 | 12.5 | -| uniform random | int16_t | 10000 | 84703 | 1547726 | 18.3 | -| uniform random | int16_t | 100000 | 1442726 | 19705242 | 13.7 | -| uniform random | int16_t | 1000000 | 20210224 | 212137465 | 10.5 | -|-----------------+-------------+------------+-----------------+-----------+----------| \ No newline at end of file diff --git a/benchmarks/meson.build b/benchmarks/meson.build index d7b62b07..fe126f15 100644 --- a/benchmarks/meson.build +++ b/benchmarks/meson.build @@ -1,19 +1,10 @@ libbench = [] -if cpp.has_argument('-march=icelake-client') - libbench += static_library('bench_qsort', - files('bench-qsort.cpp', ), - dependencies: gbench_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=icelake-client'], - ) -endif - -if cancompilefp16 - libbench += static_library('bench_qsortfp16', - files('bench-qsortfp16.cpp', ), - dependencies: gbench_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=sapphirerapids'], - ) -endif +libbench += static_library('bench_qsort', + files( + 'bench-all.cpp', + ), + dependencies: gbench_dep, + include_directories : [src, lib, utils], + cpp_args : ['-O3'], + ) diff --git a/meson.build b/meson.build index 68c442fb..71a566f6 100644 --- a/meson.build +++ b/meson.build @@ -28,6 +28,7 @@ libsimdsort = shared_library('x86simdsort', 'lib/x86simdsort.cpp', include_directories : [lib], link_whole : [libtargets], + cpp_args : ['-O3'], ) testexe = executable('testexe', @@ -37,10 +38,11 @@ testexe = executable('testexe', ) benchexe = executable('benchexe', - include_directories : [src, utils, bench], + include_directories : [src, lib, utils, bench], dependencies : [gbench_dep], link_args: ['-lbenchmark_main'], link_whole : [libbench], + link_with : libsimdsort, ) summary({ diff --git a/run-bench.py b/run-bench.py index c93cfae9..cf86d0da 100644 --- a/run-bench.py +++ b/run-bench.py @@ -19,17 +19,17 @@ baseline = "" contender = "" if "qsort" in args.benchcompare: - baseline = "stdsort.*" + filterb - contender = "avx512qsort.*" + filterb - elif "qselect" in args.benchcompare: - baseline = "stdnthelement.*" + filterb - contender = "avx512_qselect.*" + filterb + baseline = "scalarsort.*" + filterb + contender = "simdsort.*" + filterb + elif "select" in args.benchcompare: + baseline = "scalarqselect.*" + filterb + contender = "simdqselect.*" + filterb elif "partial" in args.benchcompare: - baseline = "stdpartialsort.*" + filterb - contender = "avx512_partial_qsort.*" + filterb + baseline = "scalarpartialsort.*" + filterb + contender = "simdpartialsort.*" + filterb elif "argsort" in args.benchcompare: - baseline = "stdargsort.*" + filterb - contender = "avx512argsort.*" + filterb + baseline = "scalarargsort.*" + filterb + contender = "simdargsort.*" + filterb else: parser.print_help(sys.stderr) parser.error("ERROR: Unknown argument '%s'" % args.benchcompare) diff --git a/utils/rand_array.h b/utils/rand_array.h index a780f50d..076cf8e4 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -2,44 +2,38 @@ * * Copyright (C) 2022 Intel Corporation * * SPDX-License-Identifier: BSD-3-Clause * *******************************************/ +#ifndef UTILS_RAND_ARRAY +#define UTILS_RAND_ARRAY #include #include #include #include +#include template static std::vector get_uniform_rand_array( int64_t arrsize, T max = std::numeric_limits::max(), - T min = std::numeric_limits::min(), - typename std::enable_if::value>::type * = 0) + T min = std::numeric_limits::min()) { std::vector arr; - std::random_device r; - std::default_random_engine e1(r()); - e1.seed(42); - std::uniform_int_distribution uniform_dist(min, max); - for (int64_t ii = 0; ii < arrsize; ++ii) { - arr.emplace_back(uniform_dist(e1)); - } - return arr; -} - -template -static std::vector get_uniform_rand_array( - int64_t arrsize, - T max = std::numeric_limits::max(), - T min = std::numeric_limits::min(), - typename std::enable_if::value>::type * = 0) -{ std::random_device rd; - std::mt19937 gen(rd()); - gen.seed(42); - std::uniform_real_distribution dis(min, max); - std::vector arr; - for (int64_t ii = 0; ii < arrsize; ++ii) { - arr.emplace_back(dis(gen)); + if constexpr(std::is_floating_point_v) { + std::mt19937 gen(rd()); + gen.seed(42); + std::uniform_real_distribution dis(min, max); + for (int64_t ii = 0; ii < arrsize; ++ii) { + arr.emplace_back(dis(gen)); + } + } + else if constexpr(std::is_integral_v) { + std::default_random_engine e1(rd()); + e1.seed(42); + std::uniform_int_distribution uniform_dist(min, max); + for (int64_t ii = 0; ii < arrsize; ++ii) { + arr.emplace_back(uniform_dist(e1)); + } } return arr; } @@ -56,3 +50,29 @@ get_uniform_rand_array_with_uniquevalues(int64_t arrsize, arr.resize(std::distance(arr.begin(), ip)); return arr; } + +template +static std::vector +get_array(std::string arrtype, int64_t ARRSIZE) +{ + std::vector arr; + if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } + else if (arrtype == "sorted") { + arr = get_uniform_rand_array(ARRSIZE); + std::sort(arr.begin(), arr.end()); + } + else if (arrtype == "constant") { + T temp = get_uniform_rand_array(1)[0]; + for (auto ii = 0; ii < ARRSIZE; ++ii) { + arr.push_back(temp); + } + } + else if (arrtype == "reverse") { + arr = get_uniform_rand_array(ARRSIZE); + std::sort(arr.begin(), arr.end()); + std::reverse(arr.begin(), arr.end()); + } + return arr; +} + +#endif // UTILS_RAND_ARRAY From b49a0f81fd4cf719c4b8e2223940dd0f717fbfdb Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 18 Sep 2023 13:37:00 -0700 Subject: [PATCH 06/23] Update tests to use x86simdsort library --- benchmarks/bench-argsort.hpp | 2 - meson.build | 5 +- tests/meson.build | 42 ++---- tests/test-argselect.hpp | 49 ------- tests/test-argsort-common.h | 81 ----------- tests/test-argsort.cpp | 9 -- tests/test-argsort.hpp | 272 ----------------------------------- tests/test-keyvalue.cpp | 87 ----------- tests/test-partial-qsort.hpp | 51 ------- tests/test-qselect.hpp | 112 --------------- tests/test-qsort-common.h | 145 ++++++++++++++++++- tests/test-qsort-fp.hpp | 49 ------- tests/test-qsort.cpp | 170 ++++++++++++++++++++-- tests/test-qsort.hpp | 172 ---------------------- tests/test-qsortfp16.cpp | 161 --------------------- utils/rand_array.h | 61 +++++++- 16 files changed, 373 insertions(+), 1095 deletions(-) delete mode 100644 tests/test-argselect.hpp delete mode 100644 tests/test-argsort-common.h delete mode 100644 tests/test-argsort.cpp delete mode 100644 tests/test-argsort.hpp delete mode 100644 tests/test-keyvalue.cpp delete mode 100644 tests/test-partial-qsort.hpp delete mode 100644 tests/test-qselect.hpp delete mode 100644 tests/test-qsort-fp.hpp delete mode 100644 tests/test-qsort.hpp delete mode 100644 tests/test-qsortfp16.cpp diff --git a/benchmarks/bench-argsort.hpp b/benchmarks/bench-argsort.hpp index cf1a39a1..66bb7bca 100644 --- a/benchmarks/bench-argsort.hpp +++ b/benchmarks/bench-argsort.hpp @@ -22,7 +22,6 @@ static void scalarargsort(benchmark::State &state, Args &&...args) std::string arrtype = std::get<1>(args_tuple); // set up array std::vector arr = get_array(arrtype, arrsize); - std::vector arr_bkp = arr; std::vector inx; // benchmark for (auto _ : state) { @@ -39,7 +38,6 @@ static void simdargsort(benchmark::State &state, Args &&...args) std::string arrtype = std::get<1>(args_tuple); // set up array std::vector arr = get_array(arrtype, arrsize); - std::vector arr_bkp = arr; std::vector inx; // benchmark for (auto _ : state) { diff --git a/meson.build b/meson.build index 71a566f6..32364f45 100644 --- a/meson.build +++ b/meson.build @@ -32,9 +32,10 @@ libsimdsort = shared_library('x86simdsort', ) testexe = executable('testexe', - include_directories : [src, utils], + include_directories : [lib, utils], dependencies : gtest_dep, - link_whole : [libtests] + link_whole : [libtests], + link_with : libsimdsort, ) benchexe = executable('benchexe', diff --git a/tests/meson.build b/tests/meson.build index ac0ce341..fece2415 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -1,31 +1,17 @@ libtests = [] -if cpp.has_argument('-march=skylake-avx512') - libtests += static_library('tests_kv', - files( - 'test-keyvalue.cpp', - 'test-argsort.cpp', - ), - dependencies: gtest_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=skylake-avx512'], - ) -endif +libtests += static_library('tests_qsort', + files('test-qsort.cpp', ), + dependencies: gtest_dep, + include_directories : [lib, utils], + cpp_args : ['-O3'], + ) -if cpp.has_argument('-march=icelake-client') - libtests += static_library('tests_qsort', - files('test-qsort.cpp', ), - dependencies: gtest_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=icelake-client'], - ) -endif - -if cancompilefp16 - libtests += static_library('tests_qsortfp16', - files('test-qsortfp16.cpp', ), - dependencies: gtest_dep, - include_directories : [src, utils], - cpp_args : ['-O3', '-march=sapphirerapids'], - ) -endif +#if cancompilefp16 +# libtests += static_library('tests_qsortfp16', +# files('test-qsortfp16.cpp', ), +# dependencies: gtest_dep, +# include_directories : [src, utils], +# cpp_args : ['-O3', '-march=sapphirerapids'], +# ) +#endif diff --git a/tests/test-argselect.hpp b/tests/test-argselect.hpp deleted file mode 100644 index 13506283..00000000 --- a/tests/test-argselect.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************* - * * Copyright (C) 2023 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -template -class avx512argselect : public ::testing::Test { -}; - -TYPED_TEST_SUITE_P(avx512argselect); - -TYPED_TEST_P(avx512argselect, test_random) -{ - if (__builtin_cpu_supports("avx512bw")) { - const int arrsize = 1024; - auto arr = get_uniform_rand_array(arrsize); - std::vector sorted_inx; - if (std::is_floating_point::value) { - arr[0] = std::numeric_limits::quiet_NaN(); - arr[1] = std::numeric_limits::quiet_NaN(); - } - sorted_inx = std_argsort(arr); - std::vector kth; - for (int64_t ii = 0; ii < arrsize - 3; ++ii) { - kth.push_back(ii); - } - for (auto &k : kth) { - std::vector inx - = avx512_argselect(arr.data(), k, arr.size()); - auto true_kth = arr[sorted_inx[k]]; - EXPECT_EQ(true_kth, arr[inx[k]]) << "Failed at index k = " << k; - if (k >= 1) { - EXPECT_GE(true_kth, std_max_element(arr, inx, 0, k - 1)) - << "failed at k = " << k; - } - if (k != arrsize - 1) { - EXPECT_LE(true_kth, - std_min_element(arr, inx, k + 1, arrsize - 1)) - << "failed at k = " << k; - } - EXPECT_UNIQUE(inx) - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512argselect, test_random); diff --git a/tests/test-argsort-common.h b/tests/test-argsort-common.h deleted file mode 100644 index 543bfaec..00000000 --- a/tests/test-argsort-common.h +++ /dev/null @@ -1,81 +0,0 @@ -#include "avx512-64bit-argsort.hpp" - -#include "rand_array.h" -#include -#include -#include - -template -std::vector std_argsort(const std::vector &arr) -{ - std::vector indices(arr.size()); - std::iota(indices.begin(), indices.end(), 0); - std::sort(indices.begin(), - indices.end(), - [&arr](int64_t left, int64_t right) -> bool { - if ((!std::isnan(arr[left])) && (!std::isnan(arr[right]))) { - return arr[left] < arr[right]; - } - else if (std::isnan(arr[left])) { - return false; - } - else { - return true; - } - }); - - return indices; -} - -template -T std_min_element(std::vector arr, - std::vector arg, - int64_t left, - int64_t right) -{ - std::vector::iterator res = std::min_element( - arg.begin() + left, - arg.begin() + right, - [arr](int64_t a, int64_t b) -> bool { - if ((!std::isnan(arr[a])) && (!std::isnan(arr[b]))) { - return arr[a] < arr[b]; - } - else if (std::isnan(arr[a])) { - return false; - } - else { - return true; - } - }); - return arr[*res]; -} - -template -T std_max_element(std::vector arr, - std::vector arg, - int64_t left, - int64_t right) -{ - std::vector::iterator res = std::max_element( - arg.begin() + left, - arg.begin() + right, - [arr](int64_t a, int64_t b) -> bool { - if ((!std::isnan(arr[a])) && (!std::isnan(arr[b]))) { - return arr[a] > arr[b]; - } - else if (std::isnan(arr[a])) { - return true; - } - else { - return false; - } - }); - return arr[*res]; -} - -#define EXPECT_UNIQUE(sorted_arg) \ - std::sort(sorted_arg.begin(), sorted_arg.end()); \ - std::vector expected_arg(sorted_arg.size()); \ - std::iota(expected_arg.begin(), expected_arg.end(), 0); \ - EXPECT_EQ(sorted_arg, expected_arg) \ - << "Indices aren't unique. Array size = " << sorted_arg.size(); diff --git a/tests/test-argsort.cpp b/tests/test-argsort.cpp deleted file mode 100644 index 41ce5ca4..00000000 --- a/tests/test-argsort.cpp +++ /dev/null @@ -1,9 +0,0 @@ -#include "test-argsort-common.h" -#include "test-argsort.hpp" -#include "test-argselect.hpp" - -using ArgTestTypes - = testing::Types; - -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512argsort, ArgTestTypes); -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512argselect, ArgTestTypes); diff --git a/tests/test-argsort.hpp b/tests/test-argsort.hpp deleted file mode 100644 index 62c3de60..00000000 --- a/tests/test-argsort.hpp +++ /dev/null @@ -1,272 +0,0 @@ -/******************************************* - * * Copyright (C) 2023 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -template -class avx512argsort : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512argsort); - -TYPED_TEST_P(avx512argsort, test_random) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - /* Random array */ - arr = get_uniform_rand_array(size); - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_constant) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - /* constant array */ - auto elem = get_uniform_rand_array(1)[0]; - for (auto jj = 0; jj < size; ++jj) { - arr.push_back(elem); - } - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_small_range) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - /* array with a smaller range of values */ - arr = get_uniform_rand_array(size, 20, 1); - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size = " << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_sorted) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - std::sort(arr.begin(), arr.end()); - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_reverse) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector arrsizes; - for (int64_t ii = 0; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - std::sort(arr.begin(), arr.end()); - std::reverse(arr.begin(), arr.end()); - std::vector inx1 = std_argsort(arr); - std::vector inx2 - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1, sort2; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx1[jj]]); - sort2.push_back(arr[inx2[jj]]); - } - EXPECT_EQ(sort1, sort2) << "Array size =" << size; - EXPECT_UNIQUE(inx2) - arr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } -} - -TYPED_TEST_P(avx512argsort, test_array_with_nan) -{ - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } - if (!std::is_floating_point::value) { - GTEST_SKIP() << "Skipping this test, it is meant for float/double"; - } - std::vector arrsizes; - for (int64_t ii = 2; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - arr[0] = std::numeric_limits::quiet_NaN(); - arr[1] = std::numeric_limits::quiet_NaN(); - std::vector inx - = avx512_argsort(arr.data(), arr.size()); - std::vector sort1; - for (auto jj = 0; jj < size; ++jj) { - sort1.push_back(arr[inx[jj]]); - } - if ((!std::isnan(sort1[size - 1])) || (!std::isnan(sort1[size - 2]))) { - FAIL() << "NAN's aren't sorted to the end"; - } - if (!std::is_sorted(sort1.begin(), sort1.end() - 2)) { - FAIL() << "Array isn't sorted"; - } - EXPECT_UNIQUE(inx) - arr.clear(); - } -} - -TYPED_TEST_P(avx512argsort, test_max_value_at_end_of_array) -{ - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } - std::vector arrsizes; - for (int64_t ii = 1; ii <= 256; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - if (std::numeric_limits::has_infinity) { - arr[size - 1] = std::numeric_limits::infinity(); - } - else { - arr[size - 1] = std::numeric_limits::max(); - } - std::vector inx = avx512_argsort(arr.data(), arr.size()); - std::vector sorted; - for (auto jj = 0; jj < size; ++jj) { - sorted.push_back(arr[inx[jj]]); - } - if (!std::is_sorted(sorted.begin(), sorted.end())) { - EXPECT_TRUE(false) << "Array of size " << size << "is not sorted"; - } - EXPECT_UNIQUE(inx) - arr.clear(); - } -} - -TYPED_TEST_P(avx512argsort, test_all_inf_array) -{ - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } - std::vector arrsizes; - for (int64_t ii = 1; ii <= 256; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - if (std::numeric_limits::has_infinity) { - for (int64_t jj = 1; jj <= size; ++jj) { - if (rand() % 0x1) { - arr.push_back(std::numeric_limits::infinity()); - } - } - } - else { - for (int64_t jj = 1; jj <= size; ++jj) { - if (rand() % 0x1) { - arr.push_back(std::numeric_limits::max()); - } - } - } - std::vector inx = avx512_argsort(arr.data(), arr.size()); - std::vector sorted; - for (auto jj = 0; jj < size; ++jj) { - sorted.push_back(arr[inx[jj]]); - } - if (!std::is_sorted(sorted.begin(), sorted.end())) { - EXPECT_TRUE(false) << "Array of size " << size << "is not sorted"; - } - EXPECT_UNIQUE(inx) - arr.clear(); - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512argsort, - test_random, - test_reverse, - test_constant, - test_sorted, - test_small_range, - test_all_inf_array, - test_array_with_nan, - test_max_value_at_end_of_array); diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp deleted file mode 100644 index 6e75f344..00000000 --- a/tests/test-keyvalue.cpp +++ /dev/null @@ -1,87 +0,0 @@ -/******************************************* - * * Copyright (C) 2022 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -#include "avx512-64bit-keyvaluesort.hpp" - -#include "rand_array.h" -#include -#include -#define inf X86_SIMD_SORT_INFINITY - -template -struct sorted_t { - K key; - K value; -}; - -template -bool compare(sorted_t a, sorted_t b) -{ - return a.key == b.key ? a.value < b.value : a.key < b.key; -} - -template -class KeyValueSort : public ::testing::Test { -}; - -TYPED_TEST_SUITE_P(KeyValueSort); - -TYPED_TEST_P(KeyValueSort, test_64bit_random_data) -{ - if (__builtin_cpu_supports("avx512bw")) { - std::vector keysizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - keysizes.push_back((TypeParam)ii); - } - std::vector keys; - std::vector values; - std::vector> sortedarr; - - for (size_t ii = 0; ii < keysizes.size(); ++ii) { - /* Random array */ - keys = get_uniform_rand_array_with_uniquevalues( - keysizes[ii]); - values = get_uniform_rand_array(keysizes[ii]); - for (size_t i = 0; i < keys.size(); i++) { - sorted_t tmp_s; - tmp_s.key = keys[i]; - tmp_s.value = values[i]; - sortedarr.emplace_back(tmp_s); - } - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), - sortedarr.end(), - compare); - avx512_qsort_kv(keys.data(), values.data(), keys.size()); - for (size_t i = 0; i < keys.size(); i++) { - ASSERT_EQ(keys[i], sortedarr[i].key); - ASSERT_EQ(values[i], sortedarr[i].value); - } - keys.clear(); - values.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TEST(KeyValueSort, test_inf_at_endofarray) -{ - std::vector key = {8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, inf}; - std::vector key_sorted - = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, inf}; - std::vector val = {7, 6, 5, 4, 3, 2, 1, 0, 8}; - std::vector val_sorted = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - avx512_qsort_kv(key.data(), val.data(), key.size()); - ASSERT_EQ(key, key_sorted); - ASSERT_EQ(val, val_sorted); -} - -REGISTER_TYPED_TEST_SUITE_P(KeyValueSort, test_64bit_random_data); - -using TypesKv = testing::Types; -INSTANTIATE_TYPED_TEST_SUITE_P(T, KeyValueSort, TypesKv); diff --git a/tests/test-partial-qsort.hpp b/tests/test-partial-qsort.hpp deleted file mode 100644 index fee3d9f3..00000000 --- a/tests/test-partial-qsort.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#include "test-qsort-common.h" - -template -class avx512_partial_sort : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512_partial_sort); - -TYPED_TEST_P(avx512_partial_sort, test_ranges) -{ - int64_t arrsize = 1024; - int64_t nranges = 500; - - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arr; - std::vector sortedarr; - std::vector psortedarr; - /* Random array */ - arr = get_uniform_rand_array(arrsize); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - - for (auto ii = 0; ii < nranges; ++ii) { - psortedarr = arr; - - /* Pick a random number of elements to sort at the beginning of the array */ - int k = get_uniform_rand_array(1, arrsize, 1).front(); - - /* Sort the range and verify all the required elements match the presorted set */ - avx512_partial_qsort( - psortedarr.data(), k, psortedarr.size()); - for (auto jj = 0; jj < k; jj++) { - ASSERT_EQ(sortedarr[jj], psortedarr[jj]); - } - - psortedarr.clear(); - } - - arr.clear(); - sortedarr.clear(); - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512_partial_sort, test_ranges); diff --git a/tests/test-qselect.hpp b/tests/test-qselect.hpp deleted file mode 100644 index b35d5486..00000000 --- a/tests/test-qselect.hpp +++ /dev/null @@ -1,112 +0,0 @@ -#include "test-qsort-common.h" - -template -class avx512_select : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512_select); - -#ifdef __FLT16_MAX__ -TEST(avx512_select, test_simple) -{ - if (__builtin_cpu_supports("avx512vbmi2")) { - std::vector<_Float16> arr{1.0, -1.0}; - avx512_qselect_fp16(reinterpret_cast(arr.data()), 0, arr.size()); - ASSERT_EQ(arr[0], -1.0); - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512vbmi2"; - } -} -#endif - -TYPED_TEST_P(avx512_select, test_random) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - std::vector sortedarr; - std::vector psortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* Random array */ - arr = get_uniform_rand_array(arrsizes[ii]); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - for (size_t k = 0; k < arr.size(); ++k) { - psortedarr = arr; - avx512_qselect( - psortedarr.data(), k, psortedarr.size()); - /* index k is correct */ - ASSERT_EQ(sortedarr[k], psortedarr[k]); - /* Check left partition */ - for (size_t jj = 0; jj < k; jj++) { - ASSERT_LE(psortedarr[jj], psortedarr[k]); - } - /* Check right partition */ - for (size_t jj = k + 1; jj < arr.size(); jj++) { - ASSERT_GE(psortedarr[jj], psortedarr[k]); - } - psortedarr.clear(); - } - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TYPED_TEST_P(avx512_select, test_small_range) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - std::vector sortedarr; - std::vector psortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* Random array */ - arr = get_uniform_rand_array(arrsizes[ii], 20, 1); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - for (size_t k = 0; k < arr.size(); ++k) { - psortedarr = arr; - avx512_qselect( - psortedarr.data(), k, psortedarr.size()); - /* index k is correct */ - ASSERT_EQ(sortedarr[k], psortedarr[k]); - /* Check left partition */ - for (size_t jj = 0; jj < k; jj++) { - ASSERT_LE(psortedarr[jj], psortedarr[k]); - } - /* Check right partition */ - for (size_t jj = k + 1; jj < arr.size(); jj++) { - ASSERT_GE(psortedarr[jj], psortedarr[k]); - } - psortedarr.clear(); - } - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_random, test_small_range); diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index 9690265a..d8177ee0 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h @@ -1,11 +1,148 @@ #ifndef AVX512_TEST_COMMON #define AVX512_TEST_COMMON -#include "avx512-16bit-qsort.hpp" -#include "avx512-32bit-qsort.hpp" -#include "avx512-64bit-qsort.hpp" - #include "rand_array.h" +#include "x86simdsort.h" #include +#define EXPECT_UNIQUE(arg) \ + auto sorted_arg = arg; \ + std::sort(sorted_arg.begin(), sorted_arg.end()); \ + std::vector expected_arg(sorted_arg.size()); \ + std::iota(expected_arg.begin(), expected_arg.end(), 0); \ + EXPECT_EQ(sorted_arg, expected_arg) \ + << "Indices aren't unique. Array size = " << sorted_arg.size(); + +#define REPORT_FAIL(msg, size, type, k) \ + ASSERT_TRUE(false) << msg << ". arr size = " << size \ + << ", type = " << type << ", k = " << k; + +/* + * Custom comparator class to handle NAN's: treats NAN > INF + */ +template +struct compare { + static constexpr auto op = Comparator {}; + bool operator()(const T a, const T b) + { + if constexpr (std::is_floating_point_v) { + T inf = std::numeric_limits::infinity(); + if (!std::isunordered(a, b)) { return op(a, b); } + else if ((std::isnan(a)) && (!std::isnan(b))) { + return b == inf ? op(inf, 1.) : op(inf, b); + } + else if ((!std::isnan(a)) && (std::isnan(b))) { + return a == inf ? op(1., inf) : op(a, inf); + } + else { + return op(1., 1.); + } + } + else { + return op(a, b); + } + } +}; + +//template +//struct compare_arg { +// compare_arg(std::vector arr) +// { +// this->arr = arr; +// } +// bool operator()(const int64_t a, const int64_t b) +// { +// return compare()(arr[a], arr[b]); +// } +// std::vector arr; +//}; + +template +void IS_SORTED(std::vector sorted, std::vector arr, std::string type) +{ + if constexpr (std::is_floating_point_v) { + auto cmp_func = compare>(); + if (!std::is_sorted(arr.begin(), arr.end(), cmp_func)) { + REPORT_FAIL("Array not sorted", arr.size(), type, -1); + } + } + else { + if (memcmp(arr.data(), sorted.data(), arr.size() * sizeof(T) != 0)) { + REPORT_FAIL("Array not sorted", arr.size(), type, -1); + } + } +} + +template +void IS_ARG_SORTED(std::vector sortedarr, + std::vector arr, + std::vector arg, + std::string type) +{ + EXPECT_UNIQUE(arg) + std::vector arr_backup; + for (auto ii : arg) { + arr_backup.push_back(arr[ii]); + } + IS_SORTED(sortedarr, arr_backup, type); +} + +template +void IS_ARR_PARTITIONED(std::vector arr, + int64_t k, + T true_kth, + std::string type) +{ + auto cmp_eq = compare>(); + auto cmp_less = compare>(); + auto cmp_leq = compare>(); + auto cmp_geq = compare>(); + + // 1) arr[k] == sorted[k]; use memcmp to handle nan + if (!cmp_eq(arr[k], true_kth)) { + REPORT_FAIL("kth element is incorrect", arr.size(), type, k); + } + // ( 2) Elements to the left of k should be atmost arr[k] + if (k >= 1) { + T max_left + = *std::max_element(arr.begin(), arr.begin() + k - 1, cmp_less); + if (!cmp_geq(arr[k], max_left)) { + REPORT_FAIL("incorrect left partition", arr.size(), type, k); + } + } + // 3) Elements to the right of k should be atleast arr[k] + if (k != (int64_t)(arr.size() - 1)) { + T min_right + = *std::min_element(arr.begin() + k + 1, arr.end(), cmp_less); + if (!cmp_leq(arr[k], min_right)) { + REPORT_FAIL("incorrect right partition", arr.size(), type, k); + } + } +} + +template +void IS_ARR_PARTIALSORTED(std::vector arr, + int64_t k, + std::vector sorted, + std::string type) +{ + if (memcmp(arr.data(), sorted.data(), k * sizeof(T)) != 0) { + REPORT_FAIL("Partial array not sorted", arr.size(), type, k); + } +} + +template +void IS_ARG_PARTITIONED(std::vector arr, + std::vector arg, + T true_kth, + int64_t k, + std::string type) +{ + EXPECT_UNIQUE(arg) + std::vector part_arr; + for (auto ii : arg) { + part_arr.push_back(arr[ii]); + } + IS_ARR_PARTITIONED(part_arr, k, true_kth, type); +} #endif diff --git a/tests/test-qsort-fp.hpp b/tests/test-qsort-fp.hpp deleted file mode 100644 index 8309d509..00000000 --- a/tests/test-qsort-fp.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/******************************************* - * * Copyright (C) 2022 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -#include "test-qsort-common.h" - -template -class avx512_sort_fp : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512_sort_fp); - -TYPED_TEST_P(avx512_sort_fp, test_random_nan) -{ - const int num_nans = 3; - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } - std::vector arrsizes; - for (int64_t ii = num_nans; ii < 1024; ++ii) { - arrsizes.push_back((TypeParam)ii); - } - std::vector arr; - std::vector sortedarr; - for (auto &size : arrsizes) { - /* Random array */ - arr = get_uniform_rand_array(size); - for (auto ii = 1; ii <= num_nans; ++ii) { - arr[size - ii] = std::numeric_limits::quiet_NaN(); - } - sortedarr = arr; - std::sort(sortedarr.begin(), sortedarr.end() - 3); - std::random_shuffle(arr.begin(), arr.end()); - avx512_qsort(arr.data(), arr.size()); - for (auto ii = 1; ii <= num_nans; ++ii) { - if (!std::isnan(arr[size - ii])) { - ASSERT_TRUE(false) - << "NAN's aren't sorted to the end. Arr size = " - << size; - } - } - if (!std::is_sorted(arr.begin(), arr.end() - num_nans)) { - ASSERT_TRUE(true) << "Array isn't sorted"; - } - arr.clear(); - sortedarr.clear(); - } -} -REGISTER_TYPED_TEST_SUITE_P(avx512_sort_fp, test_random_nan); diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index a35d8e8c..6cc0553f 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -1,7 +1,162 @@ -#include "test-qsort.hpp" -#include "test-partial-qsort.hpp" -#include "test-qselect.hpp" -#include "test-qsort-fp.hpp" +/******************************************* + * * Copyright (C) 2022 Intel Corporation + * * SPDX-License-Identifier: BSD-3-Clause + * *******************************************/ + +#include "test-qsort-common.h" + +template +class simdsort : public ::testing::Test { +public: + simdsort() + { + std::iota(arrsize.begin(), arrsize.end(), 1); + arrtype = {"random", + "constant", + "sorted", + "reverse", + "smallrange", + "max_at_the_end", + "rand_max", + "rand_with_nan"}; + } + std::vector arrtype; + std::vector arrsize = std::vector(1024); +}; + +TYPED_TEST_SUITE_P(simdsort); + +TYPED_TEST_P(simdsort, test_qsort) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::sort(sortedarr.begin(), + sortedarr.end(), + compare>()); + x86simdsort::qsort(arr.data(), arr.size()); + IS_SORTED(sortedarr, arr, type); + arr.clear(); + sortedarr.clear(); + } + } +} + +TYPED_TEST_P(simdsort, test_argsort) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::sort(sortedarr.begin(), + sortedarr.end(), + compare>()); + auto arg = x86simdsort::argsort(arr.data(), arr.size()); + IS_ARG_SORTED(sortedarr, arr, arg, type); + arr.clear(); + arg.clear(); + } + } +} + +TYPED_TEST_P(simdsort, test_qselect) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + int64_t k = rand() % size; + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::nth_element(sortedarr.begin(), + sortedarr.begin() + k, + sortedarr.end(), + compare>()); + x86simdsort::qselect(arr.data(), k, arr.size(), true); + IS_ARR_PARTITIONED(arr, k, sortedarr[k], type); + arr.clear(); + sortedarr.clear(); + } + } +} + +TYPED_TEST_P(simdsort, test_argselect) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + int64_t k = rand() % size; + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::sort(sortedarr.begin(), + sortedarr.end(), + compare>()); + auto arg = x86simdsort::argselect(arr.data(), k, arr.size()); + auto arg1 = x86simdsort::argsort(arr.data(), arr.size()); + IS_ARG_PARTITIONED(arr, arg, sortedarr[k], k, type); + arr.clear(); + sortedarr.clear(); + } + } +} + +TYPED_TEST_P(simdsort, test_partial_qsort) +{ + for (auto type : this->arrtype) { + for (auto size : this->arrsize) { + // k should be at least 1 + int64_t k = std::max(0x1l, rand() % size); + std::vector arr = get_array(type, size); + std::vector sortedarr = arr; + std::sort(sortedarr.begin(), + sortedarr.end(), + compare>()); + x86simdsort::partial_qsort(arr.data(), k, arr.size(), true); + IS_ARR_PARTIALSORTED(arr, k, sortedarr, type); + arr.clear(); + sortedarr.clear(); + } + } +} + +TYPED_TEST_P(simdsort, test_comparator) +{ + if constexpr (std::is_floating_point_v) { + auto less = compare>(); + auto leq = compare>(); + auto greater = compare>(); + auto geq = compare>(); + auto equal = compare>(); + TypeParam nan = std::numeric_limits::quiet_NaN(); + TypeParam inf = std::numeric_limits::infinity(); + ASSERT_EQ(less(nan, inf), false); + ASSERT_EQ(less(nan, nan), false); + ASSERT_EQ(less(inf, nan), true); + ASSERT_EQ(less(inf, inf), false); + ASSERT_EQ(leq(nan, inf), false); + ASSERT_EQ(leq(nan, nan), true); + ASSERT_EQ(leq(inf, nan), true); + ASSERT_EQ(leq(inf, inf), true); + ASSERT_EQ(geq(nan, inf), true); + ASSERT_EQ(geq(nan, nan), true); + ASSERT_EQ(geq(inf, nan), false); + ASSERT_EQ(geq(inf, inf), true); + ASSERT_EQ(greater(nan, inf), true); + ASSERT_EQ(greater(nan, nan), false); + ASSERT_EQ(greater(inf, nan), false); + ASSERT_EQ(greater(inf, inf), false); + ASSERT_EQ(equal(nan, inf), false); + ASSERT_EQ(equal(nan, nan), true); + ASSERT_EQ(equal(inf, nan), false); + ASSERT_EQ(equal(inf, inf), true); + } +} + +REGISTER_TYPED_TEST_SUITE_P(simdsort, + test_qsort, + test_argsort, + test_argselect, + test_qselect, + test_partial_qsort, + test_comparator); using QSortTestTypes = testing::Types; -using QSortTestFPTypes = testing::Types; - -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_sort, QSortTestTypes); -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_sort_fp, QSortTestFPTypes); -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_select, QSortTestTypes); -INSTANTIATE_TYPED_TEST_SUITE_P(T, avx512_partial_sort, QSortTestTypes); +INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdsort, QSortTestTypes); diff --git a/tests/test-qsort.hpp b/tests/test-qsort.hpp deleted file mode 100644 index d6c1d85a..00000000 --- a/tests/test-qsort.hpp +++ /dev/null @@ -1,172 +0,0 @@ -/******************************************* - * * Copyright (C) 2022 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -#include "test-qsort-common.h" - -template -class avx512_sort : public ::testing::Test { -}; -TYPED_TEST_SUITE_P(avx512_sort); - -TYPED_TEST_P(avx512_sort, test_random) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back((TypeParam)ii); - } - std::vector arr; - std::vector sortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* Random array */ - arr = get_uniform_rand_array(arrsizes[ii]); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - avx512_qsort(arr.data(), arr.size()); - ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii]; - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TYPED_TEST_P(avx512_sort, test_reverse) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back((TypeParam)(ii + 1)); - } - std::vector arr; - std::vector sortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* reverse array */ - for (int jj = 0; jj < arrsizes[ii]; ++jj) { - arr.push_back((TypeParam)(arrsizes[ii] - jj)); - } - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - avx512_qsort(arr.data(), arr.size()); - ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii]; - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TYPED_TEST_P(avx512_sort, test_constant) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back((TypeParam)(ii + 1)); - } - std::vector arr; - std::vector sortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* constant array */ - for (int jj = 0; jj < arrsizes[ii]; ++jj) { - arr.push_back(ii); - } - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - avx512_qsort(arr.data(), arr.size()); - ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii]; - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TYPED_TEST_P(avx512_sort, test_small_range) -{ - if (__builtin_cpu_supports("avx512bw")) { - if ((sizeof(TypeParam) == 2) - && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back((TypeParam)(ii + 1)); - } - std::vector arr; - std::vector sortedarr; - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - arr = get_uniform_rand_array(arrsizes[ii], 20, 1); - sortedarr = arr; - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - avx512_qsort(arr.data(), arr.size()); - ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii]; - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512bw"; - } -} - -TYPED_TEST_P(avx512_sort, test_max_value_at_end_of_array) -{ - if (!__builtin_cpu_supports("avx512bw")) { - GTEST_SKIP() << "Skipping this test, it requires avx512bw ISA"; - } - if ((sizeof(TypeParam) == 2) && (!__builtin_cpu_supports("avx512vbmi2"))) { - GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2"; - } - std::vector arrsizes; - for (int64_t ii = 1; ii <= 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector arr; - std::vector sortedarr; - for (auto &size : arrsizes) { - arr = get_uniform_rand_array(size); - if (std::numeric_limits::has_infinity) { - arr[size - 1] = std::numeric_limits::infinity(); - } - else { - arr[size - 1] = std::numeric_limits::max(); - } - sortedarr = arr; - avx512_qsort(arr.data(), arr.size()); - std::sort(sortedarr.begin(), sortedarr.end()); - EXPECT_EQ(sortedarr, arr) << "Array size = " << size; - arr.clear(); - sortedarr.clear(); - } -} - -REGISTER_TYPED_TEST_SUITE_P(avx512_sort, - test_random, - test_reverse, - test_constant, - test_small_range, - test_max_value_at_end_of_array); diff --git a/tests/test-qsortfp16.cpp b/tests/test-qsortfp16.cpp deleted file mode 100644 index d1bd985a..00000000 --- a/tests/test-qsortfp16.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/******************************************* - * * Copyright (C) 2022 Intel Corporation - * * SPDX-License-Identifier: BSD-3-Clause - * *******************************************/ - -#include "avx512fp16-16bit-qsort.hpp" - -#include "rand_array.h" -#include -#include - -TEST(avx512_qsort_float16, test_arrsizes) -{ - if (__builtin_cpu_supports("avx512fp16")) { - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector<_Float16> arr; - std::vector<_Float16> sortedarr; - - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* Random array */ - for (auto jj = 0; jj < arrsizes[ii]; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - sortedarr.push_back(temp); - } - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - avx512_qsort<_Float16>(arr.data(), arr.size()); - ASSERT_EQ(sortedarr, arr); - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; - } -} - -TEST(avx512_qsort_float16, test_special_floats) -{ - if (__builtin_cpu_supports("avx512fp16")) { - const int arrsize = 1111; - std::vector<_Float16> arr; - std::vector<_Float16> sortedarr; - Fp16Bits temp; - for (size_t jj = 0; jj < arrsize; ++jj) { - temp.f_ = (float)rand() / (float)(RAND_MAX); - switch (rand() % 10) { - case 0: temp.i_ = 0xFFFF; break; - case 1: temp.i_ = X86_SIMD_SORT_INFINITYH; break; - case 2: temp.i_ = X86_SIMD_SORT_NEGINFINITYH; break; - default: break; - } - arr.push_back(temp.f_); - sortedarr.push_back(temp.f_); - } - /* Cannot use std::sort because it treats NAN differently */ - avx512_qsort_fp16(reinterpret_cast(sortedarr.data()), - sortedarr.size()); - avx512_qsort<_Float16>(arr.data(), arr.size()); - // Cannot rely on ASSERT_EQ since it returns false if there are NAN's - if (memcmp(arr.data(), sortedarr.data(), arrsize * 2) != 0) { - ASSERT_EQ(sortedarr, arr); - } - arr.clear(); - sortedarr.clear(); - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; - } -} - -TEST(avx512_qselect_float16, test_arrsizes) -{ - if (__builtin_cpu_supports("avx512fp16")) { - std::vector arrsizes; - for (int64_t ii = 0; ii < 1024; ++ii) { - arrsizes.push_back(ii); - } - std::vector<_Float16> arr; - std::vector<_Float16> sortedarr; - std::vector<_Float16> psortedarr; - - for (size_t ii = 0; ii < arrsizes.size(); ++ii) { - /* Random array */ - for (auto jj = 0; jj < arrsizes[ii]; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - sortedarr.push_back(temp); - } - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - for (size_t k = 0; k < arr.size(); ++k) { - psortedarr = arr; - avx512_qselect<_Float16>( - psortedarr.data(), k, psortedarr.size()); - /* index k is correct */ - ASSERT_EQ(sortedarr[k], psortedarr[k]); - /* Check left partition */ - for (size_t jj = 0; jj < k; jj++) { - ASSERT_LE(psortedarr[jj], psortedarr[k]); - } - /* Check right partition */ - for (size_t jj = k + 1; jj < arr.size(); jj++) { - ASSERT_GE(psortedarr[jj], psortedarr[k]); - } - psortedarr.clear(); - } - arr.clear(); - sortedarr.clear(); - } - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; - } -} - -TEST(avx512_partial_qsort_float16, test_ranges) -{ - if (__builtin_cpu_supports("avx512fp16")) { - int64_t arrsize = 1024; - int64_t nranges = 500; - - std::vector<_Float16> arr; - std::vector<_Float16> sortedarr; - std::vector<_Float16> psortedarr; - - /* Random array */ - for (auto ii = 0; ii < arrsize; ++ii) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - sortedarr.push_back(temp); - } - /* Sort with std::sort for comparison */ - std::sort(sortedarr.begin(), sortedarr.end()); - - for (auto ii = 0; ii < nranges; ++ii) { - psortedarr = arr; - - int k = get_uniform_rand_array(1, arrsize, 1).front(); - - /* Sort the range and verify all the required elements match the presorted set */ - avx512_partial_qsort<_Float16>( - psortedarr.data(), k, psortedarr.size()); - for (auto jj = 0; jj < k; jj++) { - ASSERT_EQ(sortedarr[jj], psortedarr[jj]); - } - - psortedarr.clear(); - } - - arr.clear(); - sortedarr.clear(); - } - else { - GTEST_SKIP() << "Skipping this test, it requires avx512fp16 ISA"; - } -} diff --git a/utils/rand_array.h b/utils/rand_array.h index 076cf8e4..0400300a 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -53,25 +53,74 @@ get_uniform_rand_array_with_uniquevalues(int64_t arrsize, template static std::vector -get_array(std::string arrtype, int64_t ARRSIZE) +get_array(std::string arrtype, + int64_t arrsize, + T min = std::numeric_limits::min(), + T max = std::numeric_limits::max()) { std::vector arr; - if (arrtype == "random") { arr = get_uniform_rand_array(ARRSIZE); } + if (arrtype == "random") { arr = get_uniform_rand_array(arrsize, max, min); } else if (arrtype == "sorted") { - arr = get_uniform_rand_array(ARRSIZE); + arr = get_uniform_rand_array(arrsize, max, min); std::sort(arr.begin(), arr.end()); } else if (arrtype == "constant") { - T temp = get_uniform_rand_array(1)[0]; - for (auto ii = 0; ii < ARRSIZE; ++ii) { + T temp = get_uniform_rand_array(1, max, min)[0]; + for (auto ii = 0; ii < arrsize; ++ii) { arr.push_back(temp); } } else if (arrtype == "reverse") { - arr = get_uniform_rand_array(ARRSIZE); + arr = get_uniform_rand_array(arrsize, max, min); std::sort(arr.begin(), arr.end()); std::reverse(arr.begin(), arr.end()); } + else if (arrtype == "smallrange") { + arr = get_uniform_rand_array(arrsize, 10, 1); + } + else if (arrtype == "max_at_the_end") { + arr = get_uniform_rand_array(arrsize, max, min); + if (std::numeric_limits::has_infinity) { + arr[arrsize - 1] = std::numeric_limits::infinity(); + } + else { + arr[arrsize - 1] = std::numeric_limits::max(); + } + } + else if (arrtype == "rand_with_nan") { + arr = get_uniform_rand_array(arrsize, max, min); + int64_t num_nans = 10 % arrsize; + std::vector rand_indx + = get_uniform_rand_array(num_nans, arrsize-1, 0); + T val; + if constexpr (std::is_floating_point_v) { + val = std::numeric_limits::quiet_NaN(); + } + else { + val = std::numeric_limits::max(); + } + for (auto ind : rand_indx) { + arr[ind] = val; + } + } + else if (arrtype == "rand_max") { + arr = get_uniform_rand_array(arrsize, max, min); + T val; + if constexpr (std::is_floating_point_v) { + val = std::numeric_limits::infinity(); + } + else { + val = std::numeric_limits::max(); + } + for (auto ii = 1; ii <= arrsize; ++ii) { + if (rand() % 0x1) { + arr[ii] = val; + } + } + } + else { + std::cout << "Warning: unrecognized array type " << arrtype << std::endl; + } return arr; } From c25661e756e30a90550a8159459147b7dd838fec Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 22 Sep 2023 13:26:36 -0700 Subject: [PATCH 07/23] Fix scalar methods to treat NAN correctlt --- lib/x86simdsort-scalar.h | 32 ++++++++++++++++-------------- meson.build | 2 +- tests/test-qsort-common.h | 41 +-------------------------------------- utils/custom-compare.h | 41 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 56 deletions(-) create mode 100644 utils/custom-compare.h diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h index 41bd3724..f3b2b4ac 100644 --- a/lib/x86simdsort-scalar.h +++ b/lib/x86simdsort-scalar.h @@ -1,36 +1,40 @@ #include #include -#define UNUSED(x) (void)(x) +#include "custom-compare.h" + namespace xss { namespace scalar { - /* TODO: handle NAN */ template void qsort(T *arr, int64_t arrsize) { - std::sort(arr, arr + arrsize); + std::sort(arr, arr + arrsize, compare>()); } template void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan) { - UNUSED(hasnan); - std::nth_element(arr, arr + k, arr + arrsize); + if (hasnan) { + std::nth_element(arr, arr + k, arr + arrsize, compare>()); + } + else { + std::nth_element(arr, arr + k, arr + arrsize); + } } template void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan) { - UNUSED(hasnan); - std::partial_sort(arr, arr + k, arr + arrsize); + if (hasnan) { + std::partial_sort(arr, arr + k, arr + arrsize, compare>()); + } + else { + std::partial_sort(arr, arr + k, arr + arrsize); + } } template std::vector argsort(T *arr, int64_t arrsize) { std::vector arg(arrsize); std::iota(arg.begin(), arg.end(), 0); - std::sort(arg.begin(), - arg.end(), - [arr](int64_t left, int64_t right) -> bool { - return arr[left] < arr[right]; - }); + std::sort(arg.begin(), arg.end(), compare_arg>(arr)); return arg; } template @@ -41,9 +45,7 @@ namespace scalar { std::nth_element(arg.begin(), arg.begin() + k, arg.end(), - [arr](int64_t left, int64_t right) -> bool { - return arr[left] < arr[right]; - }); + compare_arg>(arr)); return arg; } diff --git a/meson.build b/meson.build index 32364f45..1de2036f 100644 --- a/meson.build +++ b/meson.build @@ -26,7 +26,7 @@ subdir('benchmarks') libsimdsort = shared_library('x86simdsort', 'lib/x86simdsort.cpp', - include_directories : [lib], + include_directories : [utils, lib], link_whole : [libtargets], cpp_args : ['-O3'], ) diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index d8177ee0..8a08f61d 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h @@ -2,6 +2,7 @@ #define AVX512_TEST_COMMON #include "rand_array.h" +#include "custom-compare.h" #include "x86simdsort.h" #include @@ -17,46 +18,6 @@ ASSERT_TRUE(false) << msg << ". arr size = " << size \ << ", type = " << type << ", k = " << k; -/* - * Custom comparator class to handle NAN's: treats NAN > INF - */ -template -struct compare { - static constexpr auto op = Comparator {}; - bool operator()(const T a, const T b) - { - if constexpr (std::is_floating_point_v) { - T inf = std::numeric_limits::infinity(); - if (!std::isunordered(a, b)) { return op(a, b); } - else if ((std::isnan(a)) && (!std::isnan(b))) { - return b == inf ? op(inf, 1.) : op(inf, b); - } - else if ((!std::isnan(a)) && (std::isnan(b))) { - return a == inf ? op(1., inf) : op(a, inf); - } - else { - return op(1., 1.); - } - } - else { - return op(a, b); - } - } -}; - -//template -//struct compare_arg { -// compare_arg(std::vector arr) -// { -// this->arr = arr; -// } -// bool operator()(const int64_t a, const int64_t b) -// { -// return compare()(arr[a], arr[b]); -// } -// std::vector arr; -//}; - template void IS_SORTED(std::vector sorted, std::vector arr, std::string type) { diff --git a/utils/custom-compare.h b/utils/custom-compare.h new file mode 100644 index 00000000..21c414ef --- /dev/null +++ b/utils/custom-compare.h @@ -0,0 +1,41 @@ +#include +#include +/* + * Custom comparator class to handle NAN's: treats NAN > INF + */ +template +struct compare { + static constexpr auto op = Comparator {}; + bool operator()(const T a, const T b) + { + if constexpr (std::is_floating_point_v) { + T inf = std::numeric_limits::infinity(); + if (!std::isunordered(a, b)) { return op(a, b); } + else if ((std::isnan(a)) && (!std::isnan(b))) { + return b == inf ? op(inf, 1.) : op(inf, b); + } + else if ((!std::isnan(a)) && (std::isnan(b))) { + return a == inf ? op(1., inf) : op(a, inf); + } + else { + return op(1., 1.); + } + } + else { + return op(a, b); + } + } +}; + +template +struct compare_arg { + compare_arg(const T* arr) + { + this->arr = arr; + } + bool operator()(const int64_t a, const int64_t b) + { + return compare()(arr[a], arr[b]); + } + const T* arr; +}; From 49927b7dc54154593d095797abfc8bb74b0d1352 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 22 Sep 2023 13:41:39 -0700 Subject: [PATCH 08/23] Update CI to run on SKL, SKX and TGL --- .github/workflows/c-cpp.yml | 86 +++---------------------------------- 1 file changed, 7 insertions(+), 79 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 67708ef1..17c3e902 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -7,45 +7,7 @@ on: branches: [ "main" ] jobs: - ICX: - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - name: Install dependencies - run: | - sudo apt update - sudo apt -y install g++-10 libgtest-dev meson curl git cmake - - - name: Install google benchmarks - run: | - git clone https://github.com/google/benchmark.git - cd benchmark - cmake -E make_directory "build" - cmake -E chdir "build" cmake -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ../ - sudo cmake --build "build" --config Release --target install - - - name: Install Intel SDE - run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/732268/sde-external-9.7.0-2022-05-09-lin.tar.xz - mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ - sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde - - - name: Build - env: - CXX: g++-10 - run: | - make clean - meson setup --warnlevel 2 --werror --buildtype plain builddir - cd builddir - ninja - - - name: Run test suite on ICX - run: sde -icx -- ./builddir/testexe - - SPR: + tests: runs-on: ubuntu-latest @@ -80,45 +42,11 @@ jobs: cd builddir ninja - - name: Run _Float16 test suite on SPR - run: sde -spr -- ./builddir/testexe --gtest_filter="*float16*" - - compare-benchmarks-with-main: - if: ${{ false }} # disable for now - - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - path: x86-simd-sort + - name: Run test suite on SKL + run: sde -skl -- ./builddir/testexe - - name: Specify branch name - working-directory: ${{ github.workspace }}/x86-simd-sort - run: git switch -c pr-branch + - name: Run test suite on SKX + run: sde -skx -- ./builddir/testexe - - uses: actions/setup-python@v4 - with: - python-version: '3.9' - - - name: Install dependencies - run: | - sudo apt update - sudo apt -y install g++-12 libgtest-dev meson curl git cmake - - - name: Install google benchmarks - run: | - git clone https://github.com/google/benchmark.git - cd benchmark - pip3 install -r tools/requirements.txt - cmake -E make_directory "build" - cmake -E chdir "build" cmake -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ../ - sudo cmake --build "build" --config Release --target install - - - name: Run bench-compare - working-directory: ${{ github.workspace }}/x86-simd-sort - env: - CXX: g++-12 - GBENCH: ${{ github.workspace }}/benchmark - run: bash -x scripts/branch-compare.sh avx + - name: Run test suite on TGL + run: sde -tgl -- ./builddir/testexe From fd7ba891288adb1ab566acf5c1b1504d10caf917 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 27 Sep 2023 13:00:13 -0700 Subject: [PATCH 09/23] Add tests for _Float16 --- benchmarks/bench-all.cpp | 17 --------------- tests/test-qsort-common.h | 4 ++++ tests/test-qsort.cpp | 8 +++++++ utils/custom-compare.h | 45 +++++++++++++++++++++++++++++++++------ utils/rand_array.h | 10 +++++++++ 5 files changed, 61 insertions(+), 23 deletions(-) diff --git a/benchmarks/bench-all.cpp b/benchmarks/bench-all.cpp index 1df466cb..d445bc63 100644 --- a/benchmarks/bench-all.cpp +++ b/benchmarks/bench-all.cpp @@ -2,23 +2,6 @@ #include "rand_array.h" #include -#ifdef __FLT16_MAX__ -template <> -std::vector<_Float16> get_uniform_rand_array( - int64_t arrsize, - _Float16 max, - _Float16 min) -{ - (void)(max); (void)(min); - std::vector<_Float16> arr; - for (auto jj = 0; jj < arrsize; ++jj) { - _Float16 temp = (float)rand() / (float)(RAND_MAX); - arr.push_back(temp); - } - return arr; -} -#endif - #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \ BENCHMARK_PRIVATE_DECLARE(func) \ = (::benchmark::internal::RegisterBenchmarkInternal( \ diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index 8a08f61d..0cb71f63 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h @@ -21,7 +21,11 @@ template void IS_SORTED(std::vector sorted, std::vector arr, std::string type) { +#ifdef __FLT16_MAX__ + if constexpr ((std::is_floating_point_v) || (std::is_same_v)) { +#else if constexpr (std::is_floating_point_v) { +#endif auto cmp_func = compare>(); if (!std::is_sorted(arr.begin(), arr.end(), cmp_func)) { REPORT_FAIL("Array not sorted", arr.size(), type, -1); diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index 6cc0553f..3e30bf7d 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -119,7 +119,11 @@ TYPED_TEST_P(simdsort, test_partial_qsort) TYPED_TEST_P(simdsort, test_comparator) { +#ifdef __FLT16_MAX__ + if constexpr ((std::is_floating_point_v) || (std::is_same_v)) { +#else if constexpr (std::is_floating_point_v) { +#endif auto less = compare>(); auto leq = compare>(); auto greater = compare>(); @@ -160,6 +164,10 @@ REGISTER_TYPED_TEST_SUITE_P(simdsort, using QSortTestTypes = testing::Types= 13 + _Float16, +#endif float, double, uint32_t, diff --git a/utils/custom-compare.h b/utils/custom-compare.h index 21c414ef..d163d7ae 100644 --- a/utils/custom-compare.h +++ b/utils/custom-compare.h @@ -1,5 +1,33 @@ #include #include + +typedef union { + _Float16 f_; + uint16_t i_; +} Fp16Bits; + +template +bool isnan(T elem) +{ + return std::isnan(elem); +} + +#ifdef __FLT16_MAX__ +template <> +bool isnan<_Float16>(_Float16 elem) +{ + Fp16Bits temp; + temp.f_ = elem; + return (temp.i_ & 0x7c00) == 0x7c00; +} +#endif + +template +bool isunordered(T a, T b) +{ + return !isnan(a + b); +} + /* * Custom comparator class to handle NAN's: treats NAN > INF */ @@ -8,17 +36,22 @@ struct compare { static constexpr auto op = Comparator {}; bool operator()(const T a, const T b) { +#ifdef __FLT16_MAX__ + if constexpr ((std::is_floating_point_v) || (std::is_same_v)) { +#else if constexpr (std::is_floating_point_v) { +#endif T inf = std::numeric_limits::infinity(); - if (!std::isunordered(a, b)) { return op(a, b); } - else if ((std::isnan(a)) && (!std::isnan(b))) { - return b == inf ? op(inf, 1.) : op(inf, b); + T one = (T) 1.0; + if (!isunordered(a, b)) { return op(a, b); } + else if ((isnan(a)) && (!isnan(b))) { + return b == inf ? op(inf, one) : op(inf, b); } - else if ((!std::isnan(a)) && (std::isnan(b))) { - return a == inf ? op(1., inf) : op(a, inf); + else if ((!isnan(a)) && (isnan(b))) { + return a == inf ? op(one, inf) : op(a, inf); } else { - return op(1., 1.); + return op(one, one); } } else { diff --git a/utils/rand_array.h b/utils/rand_array.h index 0400300a..077f1c45 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -27,6 +27,16 @@ static std::vector get_uniform_rand_array( arr.emplace_back(dis(gen)); } } +#ifdef __FLT16_MAX__ + else if constexpr(std::is_same_v) { + (void)(max); (void)(min); + std::vector<_Float16> arr; + for (auto jj = 0; jj < arrsize; ++jj) { + float temp = (float)rand() / (float)(RAND_MAX); + arr.push_back((_Float16)temp); + } + } +#endif else if constexpr(std::is_integral_v) { std::default_random_engine e1(rd()); e1.seed(42); From 3ce31ccacf4a930c969a1e1f8e2d7e260e792a57 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 27 Sep 2023 13:00:58 -0700 Subject: [PATCH 10/23] Use g++-13 in CI build and test --- .github/workflows/c-cpp.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 17c3e902..84a49c82 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -17,7 +17,7 @@ jobs: - name: Install dependencies run: | sudo apt update - sudo apt -y install g++-12 libgtest-dev meson curl git cmake + sudo apt -y install g++-13 libgtest-dev meson curl git cmake - name: Install google benchmarks run: | @@ -35,7 +35,7 @@ jobs: - name: Build env: - CXX: g++-12 + CXX: g++-13 run: | make clean meson setup --warnlevel 2 --werror --buildtype plain builddir From 3b2e6a921035fc025e5729473ba5ee0afb541e47 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 27 Sep 2023 21:09:21 -0700 Subject: [PATCH 11/23] Fix broken tests for _Float16 --- tests/test-qsort-common.h | 6 +-- tests/test-qsort.cpp | 10 ++--- utils/custom-compare.h | 42 +++--------------- utils/custom-float.h | 90 +++++++++++++++++++++++++++++++++++++++ utils/rand_array.h | 26 +++++------ 5 files changed, 113 insertions(+), 61 deletions(-) create mode 100644 utils/custom-float.h diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index 0cb71f63..b2daeb21 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h @@ -21,11 +21,7 @@ template void IS_SORTED(std::vector sorted, std::vector arr, std::string type) { -#ifdef __FLT16_MAX__ - if constexpr ((std::is_floating_point_v) || (std::is_same_v)) { -#else - if constexpr (std::is_floating_point_v) { -#endif + if constexpr (xss::fp::is_floating_point_v) { auto cmp_func = compare>(); if (!std::is_sorted(arr.begin(), arr.end(), cmp_func)) { REPORT_FAIL("Array not sorted", arr.size(), type, -1); diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index 3e30bf7d..40098a46 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -119,18 +119,14 @@ TYPED_TEST_P(simdsort, test_partial_qsort) TYPED_TEST_P(simdsort, test_comparator) { -#ifdef __FLT16_MAX__ - if constexpr ((std::is_floating_point_v) || (std::is_same_v)) { -#else - if constexpr (std::is_floating_point_v) { -#endif + if constexpr (xss::fp::is_floating_point_v) { auto less = compare>(); auto leq = compare>(); auto greater = compare>(); auto geq = compare>(); auto equal = compare>(); - TypeParam nan = std::numeric_limits::quiet_NaN(); - TypeParam inf = std::numeric_limits::infinity(); + TypeParam nan = xss::fp::quiet_NaN(); + TypeParam inf = xss::fp::infinity(); ASSERT_EQ(less(nan, inf), false); ASSERT_EQ(less(nan, nan), false); ASSERT_EQ(less(inf, nan), true); diff --git a/utils/custom-compare.h b/utils/custom-compare.h index d163d7ae..d99f0491 100644 --- a/utils/custom-compare.h +++ b/utils/custom-compare.h @@ -1,32 +1,6 @@ #include #include - -typedef union { - _Float16 f_; - uint16_t i_; -} Fp16Bits; - -template -bool isnan(T elem) -{ - return std::isnan(elem); -} - -#ifdef __FLT16_MAX__ -template <> -bool isnan<_Float16>(_Float16 elem) -{ - Fp16Bits temp; - temp.f_ = elem; - return (temp.i_ & 0x7c00) == 0x7c00; -} -#endif - -template -bool isunordered(T a, T b) -{ - return !isnan(a + b); -} +#include "custom-float.h" /* * Custom comparator class to handle NAN's: treats NAN > INF @@ -36,18 +10,14 @@ struct compare { static constexpr auto op = Comparator {}; bool operator()(const T a, const T b) { -#ifdef __FLT16_MAX__ - if constexpr ((std::is_floating_point_v) || (std::is_same_v)) { -#else - if constexpr (std::is_floating_point_v) { -#endif - T inf = std::numeric_limits::infinity(); + if constexpr (xss::fp::is_floating_point_v) { + T inf = xss::fp::infinity(); T one = (T) 1.0; - if (!isunordered(a, b)) { return op(a, b); } - else if ((isnan(a)) && (!isnan(b))) { + if (!xss::fp::isunordered(a, b)) { return op(a, b); } + else if ((xss::fp::isnan(a)) && (!xss::fp::isnan(b))) { return b == inf ? op(inf, one) : op(inf, b); } - else if ((!isnan(a)) && (isnan(b))) { + else if ((!xss::fp::isnan(a)) && (xss::fp::isnan(b))) { return a == inf ? op(one, inf) : op(a, inf); } else { diff --git a/utils/custom-float.h b/utils/custom-float.h new file mode 100644 index 00000000..7c823f0c --- /dev/null +++ b/utils/custom-float.h @@ -0,0 +1,90 @@ +#ifndef UTILS_FLOAT +#define UTILS_FLOAT +namespace xss { +namespace fp +{ + template + inline constexpr bool is_floating_point_v = std::is_floating_point_v; + + template + bool isnan(T elem) + { + return std::isnan(elem); + } + template + bool isunordered(T a, T b) + { + return std::isunordered(a, b); + } + template + T max() + { + return std::numeric_limits::max(); + } + template + T min() + { + return std::numeric_limits::min(); + } + template + T infinity() + { + return std::numeric_limits::infinity(); + } + template + T quiet_NaN() + { + return std::numeric_limits::quiet_NaN(); + } + +#ifdef __FLT16_MAX__ + typedef union { + _Float16 f_; + uint16_t i_; + } Fp16Bits; + + _Float16 convert_bits(uint16_t val) + { + Fp16Bits temp; + temp.i_ = val; + return temp.f_; + } + + template <> + inline constexpr bool is_floating_point_v<_Float16> = true; + + template <> + bool isnan<_Float16>(_Float16 elem) + { + return elem != elem; + } + template <> + bool isunordered<_Float16>(_Float16 a, _Float16 b) + { + return isnan(a) || isnan(b); + } + template <> + _Float16 max<_Float16>() + { + return convert_bits(0x7bff); + } + template <> + _Float16 min<_Float16>() + { + return convert_bits(0x0400); + } + template <> + _Float16 infinity<_Float16>() + { + return convert_bits(0x7c00); + } + template <> + _Float16 quiet_NaN<_Float16>() + { + return convert_bits(0x7c01); + } +#endif + +} // namespace float +} // namespace xss +#endif diff --git a/utils/rand_array.h b/utils/rand_array.h index 077f1c45..562c67bf 100644 --- a/utils/rand_array.h +++ b/utils/rand_array.h @@ -10,12 +10,13 @@ #include #include #include +#include "custom-float.h" template static std::vector get_uniform_rand_array( int64_t arrsize, - T max = std::numeric_limits::max(), - T min = std::numeric_limits::min()) + T max = xss::fp::max(), + T min = xss::fp::min()) { std::vector arr; std::random_device rd; @@ -30,7 +31,6 @@ static std::vector get_uniform_rand_array( #ifdef __FLT16_MAX__ else if constexpr(std::is_same_v) { (void)(max); (void)(min); - std::vector<_Float16> arr; for (auto jj = 0; jj < arrsize; ++jj) { float temp = (float)rand() / (float)(RAND_MAX); arr.push_back((_Float16)temp); @@ -51,8 +51,8 @@ static std::vector get_uniform_rand_array( template static std::vector get_uniform_rand_array_with_uniquevalues(int64_t arrsize, - T max = std::numeric_limits::max(), - T min = std::numeric_limits::min()) + T max = xss::fp::max(), + T min = xss::fp::min()) { std::vector arr = get_uniform_rand_array(arrsize, max, min); typename std::vector::iterator ip @@ -65,8 +65,8 @@ template static std::vector get_array(std::string arrtype, int64_t arrsize, - T min = std::numeric_limits::min(), - T max = std::numeric_limits::max()) + T min = xss::fp::min(), + T max = xss::fp::max()) { std::vector arr; if (arrtype == "random") { arr = get_uniform_rand_array(arrsize, max, min); } @@ -90,8 +90,8 @@ get_array(std::string arrtype, } else if (arrtype == "max_at_the_end") { arr = get_uniform_rand_array(arrsize, max, min); - if (std::numeric_limits::has_infinity) { - arr[arrsize - 1] = std::numeric_limits::infinity(); + if (xss::fp::is_floating_point_v) { + arr[arrsize - 1] = xss::fp::infinity(); } else { arr[arrsize - 1] = std::numeric_limits::max(); @@ -103,8 +103,8 @@ get_array(std::string arrtype, std::vector rand_indx = get_uniform_rand_array(num_nans, arrsize-1, 0); T val; - if constexpr (std::is_floating_point_v) { - val = std::numeric_limits::quiet_NaN(); + if constexpr (xss::fp::is_floating_point_v) { + val = xss::fp::quiet_NaN(); } else { val = std::numeric_limits::max(); @@ -116,8 +116,8 @@ get_array(std::string arrtype, else if (arrtype == "rand_max") { arr = get_uniform_rand_array(arrsize, max, min); T val; - if constexpr (std::is_floating_point_v) { - val = std::numeric_limits::infinity(); + if constexpr (xss::fp::is_floating_point_v) { + val = xss::fp::infinity(); } else { val = std::numeric_limits::max(); From b1669e72a72b7f5498256b35ac3e9e321c53b6e4 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 28 Sep 2023 09:54:56 -0700 Subject: [PATCH 12/23] Use memcmp to test qsort --- tests/test-qsort-common.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index b2daeb21..f4f15f3d 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h @@ -21,16 +21,8 @@ template void IS_SORTED(std::vector sorted, std::vector arr, std::string type) { - if constexpr (xss::fp::is_floating_point_v) { - auto cmp_func = compare>(); - if (!std::is_sorted(arr.begin(), arr.end(), cmp_func)) { - REPORT_FAIL("Array not sorted", arr.size(), type, -1); - } - } - else { - if (memcmp(arr.data(), sorted.data(), arr.size() * sizeof(T) != 0)) { - REPORT_FAIL("Array not sorted", arr.size(), type, -1); - } + if (memcmp(arr.data(), sorted.data(), arr.size() * sizeof(T) != 0)) { + REPORT_FAIL("Array not sorted", arr.size(), type, -1); } } From 4bce094113970413f071cd67a5b8d75dc727332e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 28 Sep 2023 15:20:06 -0700 Subject: [PATCH 13/23] Add static keyword where applicable --- src/avx512-16bit-qsort.hpp | 36 +++-- src/avx512-32bit-qsort.hpp | 4 +- src/avx512-64bit-argsort.hpp | 102 +++++++------ src/avx512-64bit-common.h | 16 +-- src/avx512-64bit-keyvaluesort.hpp | 43 +++--- src/avx512-common-argsort.h | 66 ++++----- src/avx512-common-qsort.h | 231 +++++++++++++++--------------- src/avx512fp16-16bit-qsort.hpp | 8 +- src/xss-network-qsort.hpp | 58 ++++---- 9 files changed, 296 insertions(+), 268 deletions(-) diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 13b732d0..a204310d 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -423,10 +423,10 @@ bool comparison_func>(const uint16_t &a, const uint16_t &b) } template <> -int64_t replace_nan_with_inf>(uint16_t *arr, - int64_t arrsize) +arrsize_t replace_nan_with_inf>(uint16_t *arr, + arrsize_t arrsize) { - int64_t nan_count = 0; + arrsize_t nan_count = 0; __mmask16 loadmask = 0xFFFF; while (arrsize > 0) { if (arrsize < 16) { loadmask = (0x0001 << arrsize) - 0x0001; } @@ -448,26 +448,42 @@ bool is_a_nan(uint16_t elem) return (elem & 0x7c00) == 0x7c00; } -void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE +void avx512_qsort_fp16(uint16_t *arr, arrsize_t arrsize) { if (arrsize > 1) { - int64_t nan_count = replace_nan_with_inf, uint16_t>( - arr, arrsize); + arrsize_t nan_count + = replace_nan_with_inf, uint16_t>(arr, + arrsize); qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } } -void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize, bool hasnan) +X86_SIMD_SORT_INLINE +void avx512_qselect_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = true) { - int64_t indx_last_elem = arrsize - 1; + arrsize_t indx_last_elem = arrsize - 1; if (UNLIKELY(hasnan)) { indx_last_elem = move_nans_to_end_of_array(arr, arrsize); } if (indx_last_elem >= k) { qselect_, uint16_t>( - arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); + arr, k, 0, indx_last_elem, 2 * (arrsize_t)log2(indx_last_elem)); } } + +X86_SIMD_SORT_INLINE +void avx512_partial_qsort_fp16(uint16_t *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false) +{ + avx512_qselect_fp16(arr, k - 1, arrsize, hasnan); + avx512_qsort_fp16(arr, k - 1); +} #endif // AVX512_QSORT_16BIT diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index fd427c28..1faeb003 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -25,10 +25,10 @@ #define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 template -X86_SIMD_SORT_INLINE reg_t sort_zmm_32bit(reg_t zmm); +reg_t sort_zmm_32bit(reg_t zmm); template -X86_SIMD_SORT_INLINE reg_t bitonic_merge_zmm_32bit(reg_t zmm); +reg_t bitonic_merge_zmm_32bit(reg_t zmm); template <> struct zmm_vector { diff --git a/src/avx512-64bit-argsort.hpp b/src/avx512-64bit-argsort.hpp index c9c5e961..1a20c7c0 100644 --- a/src/avx512-64bit-argsort.hpp +++ b/src/avx512-64bit-argsort.hpp @@ -12,13 +12,13 @@ #include "avx512-common-argsort.h" template -void std_argselect_withnan( - T *arr, int64_t *arg, int64_t k, int64_t left, int64_t right) +X86_SIMD_SORT_INLINE void std_argselect_withnan( + T *arr, arrsize_t *arg, arrsize_t k, arrsize_t left, arrsize_t right) { std::nth_element(arg + left, arg + k, arg + right, - [arr](int64_t a, int64_t b) -> bool { + [arr](arrsize_t a, arrsize_t b) -> bool { if ((!std::isnan(arr[a])) && (!std::isnan(arr[b]))) { return arr[a] < arr[b]; } @@ -33,11 +33,12 @@ void std_argselect_withnan( /* argsort using std::sort */ template -void std_argsort_withnan(T *arr, int64_t *arg, int64_t left, int64_t right) +X86_SIMD_SORT_INLINE void +std_argsort_withnan(T *arr, arrsize_t *arg, arrsize_t left, arrsize_t right) { std::sort(arg + left, arg + right, - [arr](int64_t left, int64_t right) -> bool { + [arr](arrsize_t left, arrsize_t right) -> bool { if ((!std::isnan(arr[left])) && (!std::isnan(arr[right]))) { return arr[left] < arr[right]; } @@ -52,18 +53,20 @@ void std_argsort_withnan(T *arr, int64_t *arg, int64_t left, int64_t right) /* argsort using std::sort */ template -void std_argsort(T *arr, int64_t *arg, int64_t left, int64_t right) +X86_SIMD_SORT_INLINE void +std_argsort(T *arr, arrsize_t *arg, arrsize_t left, arrsize_t right) { std::sort(arg + left, arg + right, - [arr](int64_t left, int64_t right) -> bool { + [arr](arrsize_t left, arrsize_t right) -> bool { // sort indices according to corresponding array element return arr[left] < arr[right]; }); } template -X86_SIMD_SORT_INLINE void argsort_8_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_8_64bit(type_t *arr, arrsize_t *arg, int32_t N) { using reg_t = typename vtype::reg_t; typename vtype::opmask_t load_mask = (0x01 << N) - 0x01; @@ -75,7 +78,8 @@ X86_SIMD_SORT_INLINE void argsort_8_64bit(type_t *arr, int64_t *arg, int32_t N) } template -X86_SIMD_SORT_INLINE void argsort_16_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_16_64bit(type_t *arr, arrsize_t *arg, int32_t N) { if (N <= 8) { argsort_8_64bit(arr, arg, N); @@ -97,7 +101,8 @@ X86_SIMD_SORT_INLINE void argsort_16_64bit(type_t *arr, int64_t *arg, int32_t N) } template -X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_32_64bit(type_t *arr, arrsize_t *arg, int32_t N) { if (N <= 16) { argsort_16_64bit(arr, arg, N); @@ -108,7 +113,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N) reg_t arrzmm[4]; argreg_t argzmm[4]; -X86_SIMD_SORT_UNROLL_LOOP(2) + X86_SIMD_SORT_UNROLL_LOOP(2) for (int ii = 0; ii < 2; ++ii) { argzmm[ii] = argtype::loadu(arg + 8 * ii); arrzmm[ii] = vtype::i64gather(arr, arg + 8 * ii); @@ -117,7 +122,7 @@ X86_SIMD_SORT_UNROLL_LOOP(2) uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull; opmask_t load_mask[2] = {0xFF, 0xFF}; -X86_SIMD_SORT_UNROLL_LOOP(2) + X86_SIMD_SORT_UNROLL_LOOP(2) for (int ii = 0; ii < 2; ++ii) { load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF; argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii); @@ -140,7 +145,8 @@ X86_SIMD_SORT_UNROLL_LOOP(2) } template -X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N) +X86_SIMD_SORT_INLINE void +argsort_64_64bit(type_t *arr, arrsize_t *arg, int32_t N) { if (N <= 32) { argsort_32_64bit(arr, arg, N); @@ -151,7 +157,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N) reg_t arrzmm[8]; argreg_t argzmm[8]; -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { argzmm[ii] = argtype::loadu(arg + 8 * ii); arrzmm[ii] = vtype::i64gather(arr, arg + 8 * ii); @@ -160,7 +166,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF}; uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull; -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF; argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii); @@ -170,7 +176,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) argzmm[ii + 4]); } -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 8; ii = ii + 2) { bitonic_merge_two_zmm_64bit( arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]); @@ -179,11 +185,11 @@ X86_SIMD_SORT_UNROLL_LOOP(4) bitonic_merge_four_zmm_64bit(arrzmm + 4, argzmm + 4); bitonic_merge_eight_zmm_64bit(arrzmm, argzmm); -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { argtype::storeu(arg + 8 * ii, argzmm[ii]); } -X86_SIMD_SORT_UNROLL_LOOP(4) + X86_SIMD_SORT_UNROLL_LOOP(4) for (int ii = 0; ii < 4; ++ii) { argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]); } @@ -192,7 +198,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) /* arsort 128 doesn't seem to make much of a difference to perf*/ //template //X86_SIMD_SORT_INLINE void -//argsort_128_64bit(type_t *arr, int64_t *arg, int32_t N) +//argsort_128_64bit(type_t *arr, arrsize_t *arg, int32_t N) //{ // if (N <= 64) { // argsort_64_64bit(arr, arg, N); @@ -212,7 +218,7 @@ X86_SIMD_SORT_UNROLL_LOOP(4) // // opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; // if (N != 128) { -// uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; +// uarrsize_t combined_mask = (0x1ull << (N - 64)) - 0x1ull; //X86_SIMD_SORT_UNROLL_LOOP(8) // for (int ii = 0; ii < 8; ++ii) { // load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF; @@ -248,14 +254,14 @@ X86_SIMD_SORT_UNROLL_LOOP(4) //} template -type_t get_pivot_64bit(type_t *arr, - int64_t *arg, - const int64_t left, - const int64_t right) +X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, + arrsize_t *arg, + const arrsize_t left, + const arrsize_t right) { if (right - left >= vtype::numlanes) { // median of 8 - int64_t size = (right - left) / 8; + arrsize_t size = (right - left) / 8; using reg_t = typename vtype::reg_t; reg_t rand_vec = vtype::set(arr[arg[left + size]], arr[arg[left + 2 * size]], @@ -275,11 +281,11 @@ type_t get_pivot_64bit(type_t *arr, } template -inline void argsort_64bit_(type_t *arr, - int64_t *arg, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void argsort_64bit_(type_t *arr, + arrsize_t *arg, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -298,7 +304,7 @@ inline void argsort_64bit_(type_t *arr, type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512_unrolled( + arrsize_t pivot_index = partition_avx512_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if (pivot != smallest) argsort_64bit_(arr, arg, left, pivot_index - 1, max_iters - 1); @@ -307,12 +313,12 @@ inline void argsort_64bit_(type_t *arr, } template -static void argselect_64bit_(type_t *arr, - int64_t *arg, - int64_t pos, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void argselect_64bit_(type_t *arr, + arrsize_t *arg, + arrsize_t pos, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -331,7 +337,7 @@ static void argselect_64bit_(type_t *arr, type_t pivot = get_pivot_64bit(arr, arg, left, right); type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index = partition_avx512_unrolled( + arrsize_t pivot_index = partition_avx512_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); if ((pivot != smallest) && (pos < pivot_index)) argselect_64bit_( @@ -343,7 +349,8 @@ static void argselect_64bit_(type_t *arr, /* argsort methods for 32-bit and 64-bit dtypes */ template -void avx512_argsort(T *arr, int64_t *arg, int64_t arrsize) +X86_SIMD_SORT_INLINE void +avx512_argsort(T *arr, arrsize_t *arg, arrsize_t arrsize) { using vectype = typename std::conditional, @@ -356,14 +363,15 @@ void avx512_argsort(T *arr, int64_t *arg, int64_t arrsize) } } argsort_64bit_( - arr, arg, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, arg, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } template -std::vector avx512_argsort(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE std::vector avx512_argsort(T *arr, + arrsize_t arrsize) { - std::vector indices(arrsize); + std::vector indices(arrsize); std::iota(indices.begin(), indices.end(), 0); avx512_argsort(arr, indices.data(), arrsize); return indices; @@ -371,7 +379,8 @@ std::vector avx512_argsort(T *arr, int64_t arrsize) /* argselect methods for 32-bit and 64-bit dtypes */ template -void avx512_argselect(T *arr, int64_t *arg, int64_t k, int64_t arrsize) +X86_SIMD_SORT_INLINE void +avx512_argselect(T *arr, arrsize_t *arg, arrsize_t k, arrsize_t arrsize) { using vectype = typename std::conditional, @@ -385,14 +394,15 @@ void avx512_argselect(T *arr, int64_t *arg, int64_t k, int64_t arrsize) } } argselect_64bit_( - arr, arg, k, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, arg, k, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } template -std::vector avx512_argselect(T *arr, int64_t k, int64_t arrsize) +X86_SIMD_SORT_INLINE std::vector +avx512_argselect(T *arr, arrsize_t k, arrsize_t arrsize) { - std::vector indices(arrsize); + std::vector indices(arrsize); std::iota(indices.begin(), indices.end(), 0); avx512_argselect(arr, indices.data(), k, arrsize); return indices; diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 3227e071..13713638 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -20,10 +20,10 @@ #define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 template -X86_SIMD_SORT_INLINE reg_t sort_zmm_64bit(reg_t zmm); +reg_t sort_zmm_64bit(reg_t zmm); template -X86_SIMD_SORT_INLINE reg_t bitonic_merge_zmm_64bit(reg_t zmm); +reg_t bitonic_merge_zmm_64bit(reg_t zmm); template <> struct ymm_vector { @@ -96,7 +96,7 @@ struct ymm_vector { { return _mm512_mask_i64gather_ps(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -248,7 +248,7 @@ struct ymm_vector { { return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -394,7 +394,7 @@ struct ymm_vector { { return _mm512_mask_i64gather_epi32(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -543,7 +543,7 @@ struct zmm_vector { { return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -673,7 +673,7 @@ struct zmm_vector { { return _mm512_mask_i64gather_epi64(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], @@ -835,7 +835,7 @@ struct zmm_vector { { return _mm512_mask_i64gather_pd(src, mask, index, base, scale); } - static reg_t i64gather(type_t *arr, int64_t *ind) + static reg_t i64gather(type_t *arr, arrsize_t *ind) { return set(arr[ind[7]], arr[ind[6]], diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index 05a69c87..43c89763 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -365,11 +365,12 @@ template -void heapify(type1_t *keys, type2_t *indexes, int64_t idx, int64_t size) +X86_SIMD_SORT_INLINE void +heapify(type1_t *keys, type2_t *indexes, arrsize_t idx, arrsize_t size) { - int64_t i = idx; + arrsize_t i = idx; while (true) { - int64_t j = 2 * i + 1; + arrsize_t j = 2 * i + 1; if (j >= size || j < 0) { break; } int k = j + 1; if (k < size && keys[j] < keys[k]) { j = k; } @@ -383,12 +384,13 @@ template -void heap_sort(type1_t *keys, type2_t *indexes, int64_t size) +X86_SIMD_SORT_INLINE void +heap_sort(type1_t *keys, type2_t *indexes, arrsize_t size) { - for (int64_t i = size / 2 - 1; i >= 0; i--) { + for (arrsize_t i = size / 2 - 1; i >= 0; i--) { heapify(keys, indexes, i, size); } - for (int64_t i = size - 1; i > 0; i--) { + for (arrsize_t i = size - 1; i > 0; i--) { std::swap(keys[0], keys[i]); std::swap(indexes[0], indexes[i]); heapify(keys, indexes, 0, i); @@ -399,11 +401,11 @@ template -void qsort_64bit_(type1_t *keys, - type2_t *indexes, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void qsort_64bit_(type1_t *keys, + type2_t *indexes, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -427,7 +429,7 @@ void qsort_64bit_(type1_t *keys, type1_t pivot = get_pivot(keys, left, right); type1_t smallest = vtype1::type_max(); type1_t biggest = vtype1::type_min(); - int64_t pivot_index = partition_avx512( + arrsize_t pivot_index = partition_avx512( keys, indexes, left, right + 1, pivot, &smallest, &biggest); if (pivot != smallest) { qsort_64bit_( @@ -440,19 +442,28 @@ void qsort_64bit_(type1_t *keys, } template -void avx512_qsort_kv(T1 *keys, T2 *indexes, int64_t arrsize) +X86_SIMD_SORT_INLINE void +avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize) { if (arrsize > 1) { if constexpr (std::is_floating_point_v) { - int64_t nan_count + arrsize_t nan_count = replace_nan_with_inf>(keys, arrsize); qsort_64bit_, zmm_vector>( - keys, indexes, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(keys, arrsize, nan_count); } else { qsort_64bit_, zmm_vector>( - keys, indexes, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + keys, + indexes, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize)); } } } diff --git a/src/avx512-common-argsort.h b/src/avx512-common-argsort.h index 375afc0b..357d143c 100644 --- a/src/avx512-common-argsort.h +++ b/src/avx512-common-argsort.h @@ -12,7 +12,7 @@ #include #include -using argtype = zmm_vector; +using argtype = zmm_vector; using argreg_t = typename argtype::reg_t; /* @@ -20,14 +20,14 @@ using argreg_t = typename argtype::reg_t; * last element that is less than equal to the pivot. */ template -static inline int32_t partition_vec(type_t *arg, - int64_t left, - int64_t right, - const argreg_t arg_vec, - const reg_t curr_vec, - const reg_t pivot_vec, - reg_t *smallest_vec, - reg_t *biggest_vec) +X86_SIMD_SORT_INLINE int32_t partition_vec(type_t *arg, + arrsize_t left, + arrsize_t right, + const argreg_t arg_vec, + const reg_t curr_vec, + const reg_t pivot_vec, + reg_t *smallest_vec, + reg_t *biggest_vec) { /* which elements are larger than the pivot */ typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec); @@ -45,13 +45,13 @@ static inline int32_t partition_vec(type_t *arg, * last element that is less than equal to the pivot. */ template -static inline int64_t partition_avx512(type_t *arr, - int64_t *arg, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, + arrsize_t *arg, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { /* make array length divisible by vtype::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { @@ -95,8 +95,8 @@ static inline int64_t partition_avx512(type_t *arr, argreg_t argvec_right = argtype::loadu(arg + (right - vtype::numlanes)); reg_t vec_right = vtype::i64gather(arr, arg + (right - vtype::numlanes)); // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += vtype::numlanes; right -= vtype::numlanes; @@ -160,13 +160,13 @@ static inline int64_t partition_avx512(type_t *arr, template -static inline int64_t partition_avx512_unrolled(type_t *arr, - int64_t *arg, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, + arrsize_t *arg, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { if (right - left <= 8 * num_unroll * vtype::numlanes) { return partition_avx512( @@ -196,7 +196,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, // first and last vtype::numlanes values are partitioned at the end reg_t vec_left[num_unroll], vec_right[num_unroll]; argreg_t argvec_left[num_unroll], argvec_right[num_unroll]; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii); vec_left[ii] = vtype::i64gather(arr, arg + left + vtype::numlanes * ii); @@ -206,8 +206,8 @@ X86_SIMD_SORT_UNROLL_LOOP(8) arr, arg + (right - vtype::numlanes * (num_unroll - ii))); } // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += num_unroll * vtype::numlanes; right -= num_unroll * vtype::numlanes; @@ -221,7 +221,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) */ if ((r_store + vtype::numlanes) - right < left - l_store) { right -= num_unroll * vtype::numlanes; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { arg_vec[ii] = argtype::loadu(arg + right + ii * vtype::numlanes); @@ -230,7 +230,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) } } else { -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes); curr_vec[ii] = vtype::i64gather( @@ -239,7 +239,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) left += num_unroll * vtype::numlanes; } // partition the current vector and save it on both sides of the array -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_gt_pivot = partition_vec(arg, @@ -256,7 +256,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) } /* partition and save vec_left and vec_right */ -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_gt_pivot = partition_vec(arg, @@ -270,7 +270,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) l_store += (vtype::numlanes - amount_gt_pivot); r_store -= amount_gt_pivot; } -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_gt_pivot = partition_vec(arg, diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index 349f51b5..8a313ddc 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -20,7 +20,7 @@ * separate files: avx512-16bit-qsort.hpp, avx512-32bit-qsort.hpp and * avx512-64bit-qsort.hpp. Article [4] is a good resource for bitonic sorting * network. The core implementations of the vectorized qsort functions - * avx512_qsort(T*, int64_t) are modified versions of avx2 quicksort + * avx512_qsort(T*, arrsize_t) are modified versions of avx2 quicksort * presented in the paper [2] and source code associated with that paper [3]. * * [1] Fast and Robust Vectorized In-Place Sorting of Primitive Types @@ -67,7 +67,7 @@ #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16) #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d -#define PRAGMA(x) _Pragma (#x) +#define PRAGMA(x) _Pragma(#x) /* Compiler specific macros specific */ #ifdef _MSC_VER @@ -100,6 +100,8 @@ #define X86_SIMD_SORT_UNROLL_LOOP(num) #endif +typedef int64_t arrsize_t; + template struct zmm_vector; @@ -113,9 +115,9 @@ bool is_a_nan(T elem) } template -int64_t replace_nan_with_inf(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE arrsize_t replace_nan_with_inf(T *arr, arrsize_t arrsize) { - int64_t nan_count = 0; + arrsize_t nan_count = 0; using opmask_t = typename vtype::opmask_t; using reg_t = typename vtype::reg_t; opmask_t loadmask; @@ -138,7 +140,7 @@ int64_t replace_nan_with_inf(T *arr, int64_t arrsize) } template -bool has_nan(type_t *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE bool has_nan(type_t *arr, arrsize_t arrsize) { using opmask_t = typename vtype::opmask_t; using reg_t = typename vtype::reg_t; @@ -165,9 +167,10 @@ bool has_nan(type_t *arr, int64_t arrsize) } template -void replace_inf_with_nan(type_t *arr, int64_t arrsize, int64_t nan_count) +X86_SIMD_SORT_INLINE void +replace_inf_with_nan(type_t *arr, arrsize_t arrsize, arrsize_t nan_count) { - for (int64_t ii = arrsize - 1; nan_count > 0; --ii) { + for (arrsize_t ii = arrsize - 1; nan_count > 0; --ii) { if constexpr (std::is_floating_point_v) { arr[ii] = std::numeric_limits::quiet_NaN(); } @@ -183,11 +186,12 @@ void replace_inf_with_nan(type_t *arr, int64_t arrsize, int64_t nan_count) * in the array which is not a nan */ template -int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, + arrsize_t arrsize) { - int64_t jj = arrsize - 1; - int64_t ii = 0; - int64_t count = 0; + arrsize_t jj = arrsize - 1; + arrsize_t ii = 0; + arrsize_t count = 0; while (ii <= jj) { if (is_a_nan(arr[ii])) { std::swap(arr[ii], arr[jj]); @@ -202,7 +206,7 @@ int64_t move_nans_to_end_of_array(T *arr, int64_t arrsize) } template -bool comparison_func(const T &a, const T &b) +X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b) { return a < b; } @@ -211,16 +215,17 @@ bool comparison_func(const T &a, const T &b) * COEX == Compare and Exchange two registers by swapping min and max values */ template -static void COEX(mm_t &a, mm_t &b) +X86_SIMD_SORT_INLINE void COEX(mm_t &a, mm_t &b) { mm_t temp = a; a = vtype::min(a, b); b = vtype::max(temp, b); } + template -static inline reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) +X86_SIMD_SORT_INLINE reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) { reg_t min = vtype::min(in2, in1); reg_t max = vtype::max(in2, in1); @@ -231,13 +236,13 @@ static inline reg_t cmp_merge(reg_t in1, reg_t in2, opmask_t mask) * number of elements that are greater than or equal to the pivot. */ template -static inline int32_t partition_vec(type_t *arr, - int64_t left, - int64_t right, - const reg_t curr_vec, - const reg_t pivot_vec, - reg_t *smallest_vec, - reg_t *biggest_vec) +X86_SIMD_SORT_INLINE int32_t partition_vec(type_t *arr, + arrsize_t left, + arrsize_t right, + const reg_t curr_vec, + const reg_t pivot_vec, + reg_t *smallest_vec, + reg_t *biggest_vec) { /* which elements are larger than or equal to the pivot */ typename vtype::opmask_t ge_mask = vtype::ge(curr_vec, pivot_vec); @@ -255,12 +260,12 @@ static inline int32_t partition_vec(type_t *arr, * first element that is greater than or equal to the pivot. */ template -static inline int64_t partition_avx512(type_t *arr, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { /* make array length divisible by vtype::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) { @@ -300,8 +305,8 @@ static inline int64_t partition_avx512(type_t *arr, reg_t vec_left = vtype::loadu(arr + left); reg_t vec_right = vtype::loadu(arr + (right - vtype::numlanes)); // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += vtype::numlanes; right -= vtype::numlanes; @@ -359,12 +364,12 @@ static inline int64_t partition_avx512(type_t *arr, template -static inline int64_t partition_avx512_unrolled(type_t *arr, - int64_t left, - int64_t right, - type_t pivot, - type_t *smallest, - type_t *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr, + arrsize_t left, + arrsize_t right, + type_t pivot, + type_t *smallest, + type_t *biggest) { if constexpr (num_unroll == 0) { return partition_avx512( @@ -399,15 +404,15 @@ static inline int64_t partition_avx512_unrolled(type_t *arr, // We will now have atleast 16 registers worth of data to process: // left and right vtype::numlanes values are partitioned at the end reg_t vec_left[num_unroll], vec_right[num_unroll]; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii); vec_right[ii] = vtype::loadu( arr + (right - vtype::numlanes * (num_unroll - ii))); } // store points of the vectors - int64_t r_store = right - vtype::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += num_unroll * vtype::numlanes; right -= num_unroll * vtype::numlanes; @@ -420,20 +425,20 @@ X86_SIMD_SORT_UNROLL_LOOP(8) */ if ((r_store + vtype::numlanes) - right < left - l_store) { right -= num_unroll * vtype::numlanes; -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes); } } else { -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes); } left += num_unroll * vtype::numlanes; } -// partition the current vector and save it on both sides of the array -X86_SIMD_SORT_UNROLL_LOOP(8) + // partition the current vector and save it on both sides of the array + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, @@ -448,8 +453,8 @@ X86_SIMD_SORT_UNROLL_LOOP(8) } } -/* partition and save vec_left[8] and vec_right[8] */ -X86_SIMD_SORT_UNROLL_LOOP(8) + /* partition and save vec_left[8] and vec_right[8] */ + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, @@ -462,7 +467,7 @@ X86_SIMD_SORT_UNROLL_LOOP(8) l_store += (vtype::numlanes - amount_ge_pivot); r_store -= amount_ge_pivot; } -X86_SIMD_SORT_UNROLL_LOOP(8) + X86_SIMD_SORT_UNROLL_LOOP(8) for (int ii = 0; ii < num_unroll; ++ii) { int32_t amount_ge_pivot = partition_vec(arr, @@ -486,7 +491,8 @@ template -static void COEX(reg_t1 &key1, reg_t1 &key2, reg_t2 &index1, reg_t2 &index2) +X86_SIMD_SORT_INLINE void +COEX(reg_t1 &key1, reg_t1 &key2, reg_t2 &index1, reg_t2 &index2) { reg_t1 key_t1 = vtype1::min(key1, key2); reg_t1 key_t2 = vtype1::max(key1, key2); @@ -506,11 +512,11 @@ template -static inline reg_t1 cmp_merge(reg_t1 in1, - reg_t1 in2, - reg_t2 &indexes1, - reg_t2 indexes2, - opmask_t mask) +X86_SIMD_SORT_INLINE reg_t1 cmp_merge(reg_t1 in1, + reg_t1 in2, + reg_t2 &indexes1, + reg_t2 indexes2, + opmask_t mask) { reg_t1 tmp_keys = cmp_merge(in1, in2, mask); indexes1 = vtype2::mask_mov(indexes2, vtype1::eq(tmp_keys, in1), indexes1); @@ -527,15 +533,15 @@ template -static inline int32_t partition_vec(type_t1 *keys, - type_t2 *indexes, - int64_t left, - int64_t right, - const reg_t1 keys_vec, - const reg_t2 indexes_vec, - const reg_t1 pivot_vec, - reg_t1 *smallest_vec, - reg_t1 *biggest_vec) +X86_SIMD_SORT_INLINE int32_t partition_vec(type_t1 *keys, + type_t2 *indexes, + arrsize_t left, + arrsize_t right, + const reg_t1 keys_vec, + const reg_t2 indexes_vec, + const reg_t1 pivot_vec, + reg_t1 *smallest_vec, + reg_t1 *biggest_vec) { /* which elements are larger than the pivot */ typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec); @@ -562,13 +568,13 @@ template -static inline int64_t partition_avx512(type_t1 *keys, - type_t2 *indexes, - int64_t left, - int64_t right, - type_t1 pivot, - type_t1 *smallest, - type_t1 *biggest) +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t1 *keys, + type_t2 *indexes, + arrsize_t left, + arrsize_t right, + type_t1 pivot, + type_t1 *smallest, + type_t1 *biggest) { /* make array length divisible by vtype1::numlanes , shortening the array */ for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) { @@ -620,8 +626,8 @@ static inline int64_t partition_avx512(type_t1 *keys, indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes)); // store points of the vectors - int64_t r_store = right - vtype1::numlanes; - int64_t l_store = left; + arrsize_t r_store = right - vtype1::numlanes; + arrsize_t l_store = left; // indices for loading the elements left += vtype1::numlanes; right -= vtype1::numlanes; @@ -689,13 +695,13 @@ static inline int64_t partition_avx512(type_t1 *keys, template X86_SIMD_SORT_INLINE type_t get_pivot_scalar(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { - constexpr int64_t numSamples = vtype::numlanes; + constexpr arrsize_t numSamples = vtype::numlanes; type_t samples[numSamples]; - int64_t delta = (right - left) / numSamples; + arrsize_t delta = (right - left) / numSamples; for (int i = 0; i < numSamples; i++) { samples[i] = arr[left + i * delta]; @@ -708,11 +714,11 @@ X86_SIMD_SORT_INLINE type_t get_pivot_scalar(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { // median of 32 - int64_t size = (right - left) / 32; + arrsize_t size = (right - left) / 32; type_t vec_arr[32] = {arr[left], arr[left + size], arr[left + 2 * size], @@ -752,11 +758,11 @@ X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { // median of 16 - int64_t size = (right - left) / 16; + arrsize_t size = (right - left) / 16; using reg_t = typename vtype::reg_t; type_t vec_arr[16] = {arr[left + size], arr[left + 2 * size], @@ -782,11 +788,11 @@ X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { // median of 8 - int64_t size = (right - left) / 8; + arrsize_t size = (right - left) / 8; using reg_t = typename vtype::reg_t; reg_t rand_vec = vtype::set(arr[left + size], arr[left + 2 * size], @@ -803,8 +809,8 @@ X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr, template X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, - const int64_t left, - const int64_t right) + const arrsize_t left, + const arrsize_t right) { if constexpr (vtype::numlanes == 8) return get_pivot_64bit(arr, left, right); @@ -816,11 +822,12 @@ X86_SIMD_SORT_INLINE type_t get_pivot(type_t *arr, return get_pivot_scalar(arr, left, right); } -template -X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N); +template +void sort_n(typename vtype::type_t *arr, int N); template -static void qsort_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) +X86_SIMD_SORT_INLINE void +qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -842,7 +849,7 @@ static void qsort_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index + arrsize_t pivot_index = partition_avx512_unrolled( arr, left, right + 1, pivot, &smallest, &biggest); @@ -852,11 +859,11 @@ static void qsort_(type_t *arr, int64_t left, int64_t right, int64_t max_iters) } template -static void qselect_(type_t *arr, - int64_t pos, - int64_t left, - int64_t right, - int64_t max_iters) +X86_SIMD_SORT_INLINE void qselect_(type_t *arr, + arrsize_t pos, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters) { /* * Resort to std::sort if quicksort isnt making any progress @@ -878,7 +885,7 @@ static void qselect_(type_t *arr, type_t smallest = vtype::type_max(); type_t biggest = vtype::type_min(); - int64_t pivot_index + arrsize_t pivot_index = partition_avx512_unrolled( arr, left, right + 1, pivot, &smallest, &biggest); @@ -890,30 +897,29 @@ static void qselect_(type_t *arr, // Regular quicksort routines: template -void avx512_qsort(T *arr, int64_t arrsize) +X86_SIMD_SORT_INLINE void avx512_qsort(T *arr, arrsize_t arrsize) { if (arrsize > 1) { /* std::is_floating_point_v<_Float16> == False, unless c++-23*/ if constexpr (std::is_floating_point_v) { - int64_t nan_count + arrsize_t nan_count = replace_nan_with_inf>(arr, arrsize); qsort_, T>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } else { qsort_, T>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); } } } -void avx512_qsort_fp16(uint16_t *arr, int64_t arrsize); - template -void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false) +X86_SIMD_SORT_INLINE void +avx512_qselect(T *arr, arrsize_t k, arrsize_t arrsize, bool hasnan = false) { - int64_t indx_last_elem = arrsize - 1; + arrsize_t indx_last_elem = arrsize - 1; /* std::is_floating_point_v<_Float16> == False, unless c++-23*/ if constexpr (std::is_floating_point_v) { if (UNLIKELY(hasnan)) { @@ -922,29 +928,18 @@ void avx512_qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false) } if (indx_last_elem >= k) { qselect_, T>( - arr, k, 0, indx_last_elem, 2 * (int64_t)log2(indx_last_elem)); + arr, k, 0, indx_last_elem, 2 * (arrsize_t)log2(indx_last_elem)); } } -void avx512_qselect_fp16(uint16_t *arr, - int64_t k, - int64_t arrsize, - bool hasnan = false); - template -inline void -avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false) +X86_SIMD_SORT_INLINE void avx512_partial_qsort(T *arr, + arrsize_t k, + arrsize_t arrsize, + bool hasnan = false) { avx512_qselect(arr, k - 1, arrsize, hasnan); avx512_qsort(arr, k - 1); } -inline void avx512_partial_qsort_fp16(uint16_t *arr, - int64_t k, - int64_t arrsize, - bool hasnan = false) -{ - avx512_qselect_fp16(arr, k - 1, arrsize, hasnan); - avx512_qsort_fp16(arr, k - 1); -} #endif // AVX512_QSORT_COMMON diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 9874b6fd..4aee3de2 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -151,21 +151,21 @@ bool is_a_nan<_Float16>(_Float16 elem) } template <> -void replace_inf_with_nan(_Float16 *arr, int64_t arrsize, int64_t nan_count) +void replace_inf_with_nan(_Float16 *arr, arrsize_t arrsize, arrsize_t nan_count) { memset(arr + arrsize - nan_count, 0xFF, nan_count * 2); } /* Specialized template function for _Float16 qsort_*/ template <> -void avx512_qsort(_Float16 *arr, int64_t arrsize) +void avx512_qsort(_Float16 *arr, arrsize_t arrsize) { if (arrsize > 1) { - int64_t nan_count + arrsize_t nan_count = replace_nan_with_inf, _Float16>(arr, arrsize); qsort_, _Float16>( - arr, 0, arrsize - 1, 2 * (int64_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); replace_inf_with_nan(arr, arrsize, nan_count); } } diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp index 09b68f70..ddfa615a 100644 --- a/src/xss-network-qsort.hpp +++ b/src/xss-network-qsort.hpp @@ -3,16 +3,14 @@ #include "avx512-common-qsort.h" -template +template X86_SIMD_SORT_INLINE void bitonic_clean_n_vec(reg_t *regs) { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int num = numVecs / 2; num >= 2; num /= 2) { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int j = 0; j < numVecs; j += num) { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < num / 2; i++) { COEX(regs[i + j], regs[i + j + num / 2]); } @@ -20,9 +18,7 @@ X86_SIMD_SORT_UNROLL_LOOP(64) } } -template +template X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs) { // Do the reverse part @@ -31,8 +27,8 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs) COEX(regs[0], regs[1]); } else if constexpr (numVecs > 2) { -// Reverse upper half -X86_SIMD_SORT_UNROLL_LOOP(64) + // Reverse upper half + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / 2; i++) { reg_t rev = vtype::reverse(regs[numVecs - i - 1]); reg_t maxV = vtype::max(regs[i], rev); @@ -45,23 +41,23 @@ X86_SIMD_SORT_UNROLL_LOOP(64) // Call cleaner bitonic_clean_n_vec(regs); -// Now do bitonic_merge -X86_SIMD_SORT_UNROLL_LOOP(64) + // Now do bitonic_merge + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs; i++) { regs[i] = vtype::bitonic_merge(regs[i]); } } template X86_SIMD_SORT_INLINE void bitonic_fullmerge_n_vec(reg_t *regs) { if constexpr (numPer > numVecs) return; else { -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / numPer; i++) { bitonic_merge_n_vec(regs + i * numPer); } @@ -70,7 +66,7 @@ X86_SIMD_SORT_UNROLL_LOOP(64) } template -X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N) +X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) { if constexpr (numVecs > 1) { if (N * 2 <= numVecs * vtype::numlanes) { @@ -80,10 +76,10 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N) } reg_t vecs[numVecs]; - + // Generate masks for loading and storing typename vtype::opmask_t ioMasks[numVecs - numVecs / 2]; -X86_SIMD_SORT_UNROLL_LOOP(64) + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { int64_t num_to_read = std::min((int64_t)std::max(0, N - i * vtype::numlanes), @@ -91,20 +87,20 @@ X86_SIMD_SORT_UNROLL_LOOP(64) ioMasks[j] = ((0x1ull << num_to_read) - 0x1ull); } -// Unmasked part of the load -X86_SIMD_SORT_UNROLL_LOOP(64) + // Unmasked part of the load + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / 2; i++) { vecs[i] = vtype::loadu(arr + i * vtype::numlanes); } -// Masked part of the load -X86_SIMD_SORT_UNROLL_LOOP(64) + // Masked part of the load + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { vecs[i] = vtype::mask_loadu( vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes); } -// Sort each loaded vector -X86_SIMD_SORT_UNROLL_LOOP(64) + // Sort each loaded vector + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs; i++) { vecs[i] = vtype::sort_vec(vecs[i]); } @@ -112,19 +108,19 @@ X86_SIMD_SORT_UNROLL_LOOP(64) // Run the full merger bitonic_fullmerge_n_vec(&vecs[0]); -// Unmasked part of the store -X86_SIMD_SORT_UNROLL_LOOP(64) + // Unmasked part of the store + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = 0; i < numVecs / 2; i++) { vtype::storeu(arr + i * vtype::numlanes, vecs[i]); } -// Masked part of the store -X86_SIMD_SORT_UNROLL_LOOP(64) + // Masked part of the store + X86_SIMD_SORT_UNROLL_LOOP(64) for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) { vtype::mask_storeu(arr + i * vtype::numlanes, ioMasks[j], vecs[i]); } } -template +template X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N) { constexpr int numVecs = maxN / vtype::numlanes; @@ -136,4 +132,4 @@ X86_SIMD_SORT_INLINE void sort_n(typename vtype::type_t *arr, int N) sort_n_vec(arr, N); } -#endif \ No newline at end of file +#endif From 7b443e73ff21f4366d4737917db8db35a0b3528f Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 29 Sep 2023 10:32:06 -0700 Subject: [PATCH 14/23] Change API: use size_t instead if int64_t --- benchmarks/bench-argsort.hpp | 10 ++++---- lib/x86simdsort-icl.cpp | 12 +++++----- lib/x86simdsort-internal.h | 30 +++++++++++------------ lib/x86simdsort-scalar.h | 14 +++++------ lib/x86simdsort-skx.cpp | 10 ++++---- lib/x86simdsort-spr.cpp | 6 ++--- lib/x86simdsort.cpp | 22 ++++++++--------- lib/x86simdsort.h | 10 ++++---- src/avx512-64bit-common.h | 4 ++++ src/avx512-common-qsort.h | 46 ++++++++++++++++++------------------ tests/test-qsort-common.h | 14 +++++------ tests/test-qsort.cpp | 8 +++---- 12 files changed, 95 insertions(+), 91 deletions(-) diff --git a/benchmarks/bench-argsort.hpp b/benchmarks/bench-argsort.hpp index 66bb7bca..0546d7c4 100644 --- a/benchmarks/bench-argsort.hpp +++ b/benchmarks/bench-argsort.hpp @@ -1,11 +1,11 @@ template -std::vector stdargsort(const std::vector &array) +std::vector stdargsort(const std::vector &array) { - std::vector indices(array.size()); + std::vector indices(array.size()); std::iota(indices.begin(), indices.end(), 0); std::sort(indices.begin(), indices.end(), - [&array](int64_t left, int64_t right) -> bool { + [&array](size_t left, size_t right) -> bool { // sort indices according to corresponding array element return array[left] < array[right]; }); @@ -22,7 +22,7 @@ static void scalarargsort(benchmark::State &state, Args &&...args) std::string arrtype = std::get<1>(args_tuple); // set up array std::vector arr = get_array(arrtype, arrsize); - std::vector inx; + std::vector inx; // benchmark for (auto _ : state) { inx = stdargsort(arr); @@ -38,7 +38,7 @@ static void simdargsort(benchmark::State &state, Args &&...args) std::string arrtype = std::get<1>(args_tuple); // set up array std::vector arr = get_array(arrtype, arrsize); - std::vector inx; + std::vector inx; // benchmark for (auto _ : state) { inx = x86simdsort::argsort(arr.data(), arrsize); diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 651d0f1b..2aa3a575 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -5,32 +5,32 @@ namespace xss { namespace avx512 { template <> - void qsort(uint16_t *arr, int64_t size) + void qsort(uint16_t *arr, size_t size) { avx512_qsort(arr, size); } template <> - void qselect(uint16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + void qselect(uint16_t *arr, size_t k, size_t arrsize, bool hasnan) { avx512_qselect(arr, k, arrsize, hasnan); } template <> - void partial_qsort(uint16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + void partial_qsort(uint16_t *arr, size_t k, size_t arrsize, bool hasnan) { avx512_partial_qsort(arr, k, arrsize, hasnan); } template <> - void qsort(int16_t *arr, int64_t size) + void qsort(int16_t *arr, size_t size) { avx512_qsort(arr, size); } template <> - void qselect(int16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + void qselect(int16_t *arr, size_t k, size_t arrsize, bool hasnan) { avx512_qselect(arr, k, arrsize, hasnan); } template <> - void partial_qsort(int16_t *arr, int64_t k, int64_t arrsize, bool hasnan) + void partial_qsort(int16_t *arr, size_t k, size_t arrsize, bool hasnan) { avx512_partial_qsort(arr, k, arrsize, hasnan); } diff --git a/lib/x86simdsort-internal.h b/lib/x86simdsort-internal.h index 2ddfaefb..d9168d5f 100644 --- a/lib/x86simdsort-internal.h +++ b/lib/x86simdsort-internal.h @@ -7,53 +7,53 @@ namespace xss { namespace avx512 { // quicksort template - void qsort(T *arr, int64_t arrsize); + void qsort(T *arr, size_t arrsize); // quickselect template - void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template - void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template - std::vector argsort(T *arr, int64_t arrsize); + std::vector argsort(T *arr, size_t arrsize); // argselect template - std::vector argselect(T *arr, int64_t k, int64_t arrsize); + std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace avx512 namespace avx2 { // quicksort template - void qsort(T *arr, int64_t arrsize); + void qsort(T *arr, size_t arrsize); // quickselect template - void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template - void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template - std::vector argsort(T *arr, int64_t arrsize); + std::vector argsort(T *arr, size_t arrsize); // argselect template - std::vector argselect(T *arr, int64_t k, int64_t arrsize); + std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace avx2 namespace scalar { // quicksort template - void qsort(T *arr, int64_t arrsize); + void qsort(T *arr, size_t arrsize); // quickselect template - void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template - void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); + void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template - std::vector argsort(T *arr, int64_t arrsize); + std::vector argsort(T *arr, size_t arrsize); // argselect template - std::vector argselect(T *arr, int64_t k, int64_t arrsize); + std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace scalar } // namespace xss #endif diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h index f3b2b4ac..06ea858a 100644 --- a/lib/x86simdsort-scalar.h +++ b/lib/x86simdsort-scalar.h @@ -5,12 +5,12 @@ namespace xss { namespace scalar { template - void qsort(T *arr, int64_t arrsize) + void qsort(T *arr, size_t arrsize) { std::sort(arr, arr + arrsize, compare>()); } template - void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan) + void qselect(T *arr, size_t k, size_t arrsize, bool hasnan) { if (hasnan) { std::nth_element(arr, arr + k, arr + arrsize, compare>()); @@ -20,7 +20,7 @@ namespace scalar { } } template - void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan) + void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan) { if (hasnan) { std::partial_sort(arr, arr + k, arr + arrsize, compare>()); @@ -30,17 +30,17 @@ namespace scalar { } } template - std::vector argsort(T *arr, int64_t arrsize) + std::vector argsort(T *arr, size_t arrsize) { - std::vector arg(arrsize); + std::vector arg(arrsize); std::iota(arg.begin(), arg.end(), 0); std::sort(arg.begin(), arg.end(), compare_arg>(arr)); return arg; } template - std::vector argselect(T *arr, int64_t k, int64_t arrsize) + std::vector argselect(T *arr, size_t k, size_t arrsize) { - std::vector arg(arrsize); + std::vector arg(arrsize); std::iota(arg.begin(), arg.end(), 0); std::nth_element(arg.begin(), arg.begin() + k, diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp index 25692e60..4ebb9c11 100644 --- a/lib/x86simdsort-skx.cpp +++ b/lib/x86simdsort-skx.cpp @@ -6,27 +6,27 @@ #define DEFINE_ALL_METHODS(type) \ template <> \ - void qsort(type *arr, int64_t arrsize) \ + void qsort(type *arr, size_t arrsize) \ { \ avx512_qsort(arr, arrsize); \ } \ template <> \ - void qselect(type *arr, int64_t k, int64_t arrsize, bool hasnan) \ + void qselect(type *arr, size_t k, size_t arrsize, bool hasnan) \ { \ avx512_qselect(arr, k, arrsize, hasnan); \ } \ template <> \ - void partial_qsort(type *arr, int64_t k, int64_t arrsize, bool hasnan) \ + void partial_qsort(type *arr, size_t k, size_t arrsize, bool hasnan) \ { \ avx512_partial_qsort(arr, k, arrsize, hasnan); \ } \ template <> \ - std::vector argsort(type *arr, int64_t arrsize) \ + std::vector argsort(type *arr, size_t arrsize) \ { \ return avx512_argsort(arr, arrsize); \ } \ template <> \ - std::vector argselect(type *arr, int64_t k, int64_t arrsize) \ + std::vector argselect(type *arr, size_t k, size_t arrsize) \ { \ return avx512_argselect(arr, k, arrsize); \ } diff --git a/lib/x86simdsort-spr.cpp b/lib/x86simdsort-spr.cpp index dd4c1f17..4672bcb8 100644 --- a/lib/x86simdsort-spr.cpp +++ b/lib/x86simdsort-spr.cpp @@ -5,17 +5,17 @@ namespace xss { namespace avx512 { template <> - void qsort(_Float16 *arr, int64_t size) + void qsort(_Float16 *arr, size_t size) { avx512_qsort(arr, size); } template <> - void qselect(_Float16 *arr, int64_t k, int64_t arrsize, bool hasnan) + void qselect(_Float16 *arr, size_t k, size_t arrsize, bool hasnan) { avx512_qselect(arr, k, arrsize, hasnan); } template <> - void partial_qsort(_Float16 *arr, int64_t k, int64_t arrsize, bool hasnan) + void partial_qsort(_Float16 *arr, size_t k, size_t arrsize, bool hasnan) { avx512_partial_qsort(arr, k, arrsize, hasnan); } diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 3c212de3..db4270dd 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -53,47 +53,47 @@ dispatch_requested(std::string_view cpurequested, #define CAT(a, b) CAT_(a, b) #define DECLARE_INTERNAL_qsort(TYPE) \ - static void (*internal_qsort##TYPE)(TYPE *, int64_t) = NULL; \ + static void (*internal_qsort##TYPE)(TYPE *, size_t) = NULL; \ template <> \ - void qsort(TYPE *arr, int64_t arrsize) \ + void qsort(TYPE *arr, size_t arrsize) \ { \ (*internal_qsort##TYPE)(arr, arrsize); \ } #define DECLARE_INTERNAL_qselect(TYPE) \ - static void (*internal_qselect##TYPE)(TYPE *, int64_t, int64_t, bool) \ + static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ - void qselect(TYPE *arr, int64_t k, int64_t arrsize, bool hasnan) \ + void qselect(TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ (*internal_qselect##TYPE)(arr, k, arrsize, hasnan); \ } #define DECLARE_INTERNAL_partial_qsort(TYPE) \ static void (*internal_partial_qsort##TYPE)( \ - TYPE *, int64_t, int64_t, bool) \ + TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ - void partial_qsort(TYPE *arr, int64_t k, int64_t arrsize, bool hasnan) \ + void partial_qsort(TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan); \ } #define DECLARE_INTERNAL_argsort(TYPE) \ - static std::vector (*internal_argsort##TYPE)(TYPE *, int64_t) \ + static std::vector (*internal_argsort##TYPE)(TYPE *, size_t) \ = NULL; \ template <> \ - std::vector argsort(TYPE *arr, int64_t arrsize) \ + std::vector argsort(TYPE *arr, size_t arrsize) \ { \ return (*internal_argsort##TYPE)(arr, arrsize); \ } #define DECLARE_INTERNAL_argselect(TYPE) \ - static std::vector (*internal_argselect##TYPE)( \ - TYPE *, int64_t, int64_t) \ + static std::vector (*internal_argselect##TYPE)( \ + TYPE *, size_t, size_t) \ = NULL; \ template <> \ - std::vector argselect(TYPE *arr, int64_t k, int64_t arrsize) \ + std::vector argselect(TYPE *arr, size_t k, size_t arrsize) \ { \ return (*internal_argselect##TYPE)(arr, k, arrsize); \ } diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index a4a1c7f6..9fc40784 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -6,18 +6,18 @@ namespace x86simdsort { // quicksort template -void qsort(T *arr, int64_t arrsize); +void qsort(T *arr, size_t arrsize); // quickselect template -void qselect(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); +void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template -void partial_qsort(T *arr, int64_t k, int64_t arrsize, bool hasnan = false); +void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template -std::vector argsort(T *arr, int64_t arrsize); +std::vector argsort(T *arr, size_t arrsize); // argselect template -std::vector argselect(T *arr, int64_t k, int64_t arrsize); +std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace x86simdsort #endif diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 13713638..4dda6314 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -708,6 +708,10 @@ struct zmm_vector { { return _mm512_mask_compressstoreu_epi64(mem, mask, x); } + static reg_t maskz_loadu(opmask_t mask, void const *mem) + { + return _mm512_maskz_loadu_epi64(mask, mem); + } static reg_t mask_loadu(reg_t x, opmask_t mask, void const *mem) { return _mm512_mask_loadu_epi64(x, mask, mem); diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index 8a313ddc..cd60575a 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -100,7 +100,7 @@ #define X86_SIMD_SORT_UNROLL_LOOP(num) #endif -typedef int64_t arrsize_t; +typedef size_t arrsize_t; template struct zmm_vector; @@ -115,49 +115,49 @@ bool is_a_nan(T elem) } template -X86_SIMD_SORT_INLINE arrsize_t replace_nan_with_inf(T *arr, arrsize_t arrsize) +X86_SIMD_SORT_INLINE arrsize_t replace_nan_with_inf(T *arr, arrsize_t size) { arrsize_t nan_count = 0; using opmask_t = typename vtype::opmask_t; using reg_t = typename vtype::reg_t; opmask_t loadmask; reg_t in; - while (arrsize > 0) { - if (arrsize < vtype::numlanes) { - loadmask = vtype::get_partial_loadmask(arrsize); - in = vtype::maskz_loadu(loadmask, arr); + /* + * (ii + numlanes) can never overflow: max val of size is 2**63 on 64-bit + * and 2**31 on 32-bit systems + */ + for (arrsize_t ii = 0; ii < size; ii = ii + vtype::numlanes) { + if (size - ii < vtype::numlanes) { + loadmask = vtype::get_partial_loadmask(size - ii); + in = vtype::maskz_loadu(loadmask, arr + ii); } else { - in = vtype::loadu(arr); + in = vtype::loadu(arr + ii); } opmask_t nanmask = vtype::template fpclass<0x01 | 0x80>(in); nan_count += _mm_popcnt_u32((int32_t)nanmask); - vtype::mask_storeu(arr, nanmask, vtype::zmm_max()); - arr += vtype::numlanes; - arrsize -= vtype::numlanes; + vtype::mask_storeu(arr + ii, nanmask, vtype::zmm_max()); } return nan_count; } template -X86_SIMD_SORT_INLINE bool has_nan(type_t *arr, arrsize_t arrsize) +X86_SIMD_SORT_INLINE bool has_nan(type_t *arr, arrsize_t size) { using opmask_t = typename vtype::opmask_t; using reg_t = typename vtype::reg_t; bool found_nan = false; opmask_t loadmask; reg_t in; - while (arrsize > 0) { - if (arrsize < vtype::numlanes) { - loadmask = vtype::get_partial_loadmask(arrsize); - in = vtype::maskz_loadu(loadmask, arr); + for (arrsize_t ii = 0; ii < size; ii = ii + vtype::numlanes) { + if (size - ii < vtype::numlanes) { + loadmask = vtype::get_partial_loadmask(size - ii); + in = vtype::maskz_loadu(loadmask, arr + ii); } else { - in = vtype::loadu(arr); + in = vtype::loadu(arr + ii); } opmask_t nanmask = vtype::template fpclass<0x01 | 0x80>(in); - arr += vtype::numlanes; - arrsize -= vtype::numlanes; if (nanmask != 0x00) { found_nan = true; break; @@ -168,9 +168,9 @@ X86_SIMD_SORT_INLINE bool has_nan(type_t *arr, arrsize_t arrsize) template X86_SIMD_SORT_INLINE void -replace_inf_with_nan(type_t *arr, arrsize_t arrsize, arrsize_t nan_count) +replace_inf_with_nan(type_t *arr, arrsize_t size, arrsize_t nan_count) { - for (arrsize_t ii = arrsize - 1; nan_count > 0; --ii) { + for (arrsize_t ii = size - 1; nan_count > 0; --ii) { if constexpr (std::is_floating_point_v) { arr[ii] = std::numeric_limits::quiet_NaN(); } @@ -187,9 +187,9 @@ replace_inf_with_nan(type_t *arr, arrsize_t arrsize, arrsize_t nan_count) */ template X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, - arrsize_t arrsize) + arrsize_t size) { - arrsize_t jj = arrsize - 1; + arrsize_t jj = size - 1; arrsize_t ii = 0; arrsize_t count = 0; while (ii <= jj) { @@ -202,7 +202,7 @@ X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, ii += 1; } } - return arrsize - count - 1; + return size - count - 1; } template diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index f4f15f3d..00ab80f9 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h @@ -9,7 +9,7 @@ #define EXPECT_UNIQUE(arg) \ auto sorted_arg = arg; \ std::sort(sorted_arg.begin(), sorted_arg.end()); \ - std::vector expected_arg(sorted_arg.size()); \ + std::vector expected_arg(sorted_arg.size()); \ std::iota(expected_arg.begin(), expected_arg.end(), 0); \ EXPECT_EQ(sorted_arg, expected_arg) \ << "Indices aren't unique. Array size = " << sorted_arg.size(); @@ -29,7 +29,7 @@ void IS_SORTED(std::vector sorted, std::vector arr, std::string type) template void IS_ARG_SORTED(std::vector sortedarr, std::vector arr, - std::vector arg, + std::vector arg, std::string type) { EXPECT_UNIQUE(arg) @@ -42,7 +42,7 @@ void IS_ARG_SORTED(std::vector sortedarr, template void IS_ARR_PARTITIONED(std::vector arr, - int64_t k, + size_t k, T true_kth, std::string type) { @@ -64,7 +64,7 @@ void IS_ARR_PARTITIONED(std::vector arr, } } // 3) Elements to the right of k should be atleast arr[k] - if (k != (int64_t)(arr.size() - 1)) { + if (k != (size_t)(arr.size() - 1)) { T min_right = *std::min_element(arr.begin() + k + 1, arr.end(), cmp_less); if (!cmp_leq(arr[k], min_right)) { @@ -75,7 +75,7 @@ void IS_ARR_PARTITIONED(std::vector arr, template void IS_ARR_PARTIALSORTED(std::vector arr, - int64_t k, + size_t k, std::vector sorted, std::string type) { @@ -86,9 +86,9 @@ void IS_ARR_PARTIALSORTED(std::vector arr, template void IS_ARG_PARTITIONED(std::vector arr, - std::vector arg, + std::vector arg, T true_kth, - int64_t k, + size_t k, std::string type) { EXPECT_UNIQUE(arg) diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index 40098a46..7ecd1a13 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -21,7 +21,7 @@ class simdsort : public ::testing::Test { "rand_with_nan"}; } std::vector arrtype; - std::vector arrsize = std::vector(1024); + std::vector arrsize = std::vector(1024); }; TYPED_TEST_SUITE_P(simdsort); @@ -64,7 +64,7 @@ TYPED_TEST_P(simdsort, test_qselect) { for (auto type : this->arrtype) { for (auto size : this->arrsize) { - int64_t k = rand() % size; + size_t k = rand() % size; std::vector arr = get_array(type, size); std::vector sortedarr = arr; std::nth_element(sortedarr.begin(), @@ -83,7 +83,7 @@ TYPED_TEST_P(simdsort, test_argselect) { for (auto type : this->arrtype) { for (auto size : this->arrsize) { - int64_t k = rand() % size; + size_t k = rand() % size; std::vector arr = get_array(type, size); std::vector sortedarr = arr; std::sort(sortedarr.begin(), @@ -103,7 +103,7 @@ TYPED_TEST_P(simdsort, test_partial_qsort) for (auto type : this->arrtype) { for (auto size : this->arrsize) { // k should be at least 1 - int64_t k = std::max(0x1l, rand() % size); + size_t k = std::max(0x1ul, rand() % size); std::vector arr = get_array(type, size); std::vector sortedarr = arr; std::sort(sortedarr.begin(), From c59c35b390737b796934f333f55953ff0e457375 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 29 Sep 2023 13:25:11 -0700 Subject: [PATCH 15/23] Expose only the necessary symbols --- lib/meson.build | 6 +++--- lib/x86simdsort-internal.h | 20 ++++++++++++++++++-- lib/x86simdsort.cpp | 2 +- lib/x86simdsort.h | 8 ++++++++ meson.build | 24 +++++++++++++++--------- src/avx512-32bit-qsort.hpp | 4 ++-- src/avx512-64bit-common.h | 4 ++-- 7 files changed, 49 insertions(+), 19 deletions(-) diff --git a/lib/meson.build b/lib/meson.build index 8bfc75f0..0f021bab 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -6,7 +6,7 @@ if cpp.has_argument('-march=skylake-avx512') 'x86simdsort-skx.cpp', ), include_directories : [src], - cpp_args : ['-O3', '-mavx512f', '-mavx512dq', '-mavx512vl'], + cpp_args : ['-O3', '-march=skylake-avx512', flags_hide_symbols], ) endif @@ -16,7 +16,7 @@ if cpp.has_argument('-march=icelake-client') 'x86simdsort-icl.cpp', ), include_directories : [src], - cpp_args : ['-O3', '-mavx512f', '-mavx512vbmi2', '-mavx512bw', '-mavx512vl', '-mf16c'], + cpp_args : ['-O3', '-march=icelake-client', flags_hide_symbols], ) endif @@ -26,6 +26,6 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-O3', '-mavx512f', '-mavx512fp16', '-mavx512vbmi2'], + cpp_args : ['-O3', 'march=sapphirerapids', flags_hide_symbols], ) endif diff --git a/lib/x86simdsort-internal.h b/lib/x86simdsort-internal.h index d9168d5f..d9618d93 100644 --- a/lib/x86simdsort-internal.h +++ b/lib/x86simdsort-internal.h @@ -1,58 +1,74 @@ -#ifndef XSS_ALL_METHODS -#define XSS_ALL_METHODS +#ifndef XSS_INTERNAL_METHODS +#define XSS_INTERNAL_METHODS #include #include +#include "x86simdsort.h" namespace xss { namespace avx512 { // quicksort template + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template + XSS_HIDE_SYMBOL void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template + XSS_HIDE_SYMBOL void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template + XSS_HIDE_SYMBOL std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace avx512 namespace avx2 { // quicksort template + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template + XSS_HIDE_SYMBOL void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template + XSS_HIDE_SYMBOL void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template + XSS_HIDE_SYMBOL std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace avx2 namespace scalar { // quicksort template + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template + XSS_HIDE_SYMBOL void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template + XSS_HIDE_SYMBOL void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template + XSS_HIDE_SYMBOL std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace scalar } // namespace xss diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index db4270dd..57568e1f 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -31,7 +31,7 @@ static int check_cpu_feature_support(std::string_view cpufeature) } std::string_view -find_preferred_cpu(std::initializer_list cpulist) +static find_preferred_cpu(std::initializer_list cpulist) { for (auto cpu : cpulist) { if (check_cpu_feature_support(cpu)) return cpu; diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 9fc40784..6b424a8e 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -3,21 +3,29 @@ #include #include +#define XSS_EXPORT_SYMBOL __attribute__ ((visibility ("default"))) +#define XSS_HIDE_SYMBOL __attribute__ ((visibility ("hidden"))) + namespace x86simdsort { // quicksort template +XSS_EXPORT_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template +XSS_EXPORT_SYMBOL void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template +XSS_EXPORT_SYMBOL void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template +XSS_EXPORT_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template +XSS_EXPORT_SYMBOL std::vector argselect(T *arr, size_t k, size_t arrsize); } // namespace x86simdsort #endif diff --git a/meson.build b/meson.build index 1de2036f..2bf65ca0 100644 --- a/meson.build +++ b/meson.build @@ -8,8 +8,8 @@ lib = include_directories('lib') bench = include_directories('benchmarks') utils = include_directories('utils') tests = include_directories('tests') -gtest_dep = dependency('gtest_main', required : true, static: true) -gbench_dep = dependency('benchmark', required : true, static: true) +gtest_dep = dependency('gtest_main', required : false, static: false) +gbench_dep = dependency('benchmark', required : false, static: false) fp16code = '''#include int main() { @@ -19,35 +19,41 @@ int main() { } ''' cancompilefp16 = cpp.compiles(fp16code, args:'-march=sapphirerapids') +flags_hide_symbols = ['-fvisibility=hidden', '-fvisibility-inlines-hidden'] subdir('lib') -subdir('tests') -subdir('benchmarks') - libsimdsort = shared_library('x86simdsort', 'lib/x86simdsort.cpp', include_directories : [utils, lib], - link_whole : [libtargets], - cpp_args : ['-O3'], + link_with : [libtargets], + cpp_args : ['-O3', flags_hide_symbols], ) -testexe = executable('testexe', +if gtest_dep.found() + subdir('tests') + testexe = executable('testexe', include_directories : [lib, utils], dependencies : gtest_dep, link_whole : [libtests], link_with : libsimdsort, ) +endif -benchexe = executable('benchexe', +if gbench_dep.found() + subdir('benchmarks') + benchexe = executable('benchexe', include_directories : [src, lib, utils, bench], dependencies : [gbench_dep], link_args: ['-lbenchmark_main'], link_whole : [libbench], link_with : libsimdsort, ) +endif summary({ 'Can compile AVX-512 FP16 ISA': cancompilefp16, + 'Built test content': gtest_dep.found(), + 'Built benchmarks': gbench_dep.found(), }, section: 'Configuration', bool_yn: true diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp index 1faeb003..fd427c28 100644 --- a/src/avx512-32bit-qsort.hpp +++ b/src/avx512-32bit-qsort.hpp @@ -25,10 +25,10 @@ #define NETWORK_32BIT_7 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 template -reg_t sort_zmm_32bit(reg_t zmm); +X86_SIMD_SORT_INLINE reg_t sort_zmm_32bit(reg_t zmm); template -reg_t bitonic_merge_zmm_32bit(reg_t zmm); +X86_SIMD_SORT_INLINE reg_t bitonic_merge_zmm_32bit(reg_t zmm); template <> struct zmm_vector { diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 4dda6314..f9018231 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -20,10 +20,10 @@ #define NETWORK_64BIT_4 3, 2, 1, 0, 7, 6, 5, 4 template -reg_t sort_zmm_64bit(reg_t zmm); +X86_SIMD_SORT_INLINE reg_t sort_zmm_64bit(reg_t zmm); template -reg_t bitonic_merge_zmm_64bit(reg_t zmm); +X86_SIMD_SORT_INLINE reg_t bitonic_merge_zmm_64bit(reg_t zmm); template <> struct ymm_vector { From d8e0a202df3db93ab3baf34db762a4b46fccc6bd Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 29 Sep 2023 14:04:53 -0700 Subject: [PATCH 16/23] Fix typo --- lib/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/meson.build b/lib/meson.build index 0f021bab..e567dde2 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -26,6 +26,6 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-O3', 'march=sapphirerapids', flags_hide_symbols], + cpp_args : ['-O3', '-march=sapphirerapids', flags_hide_symbols], ) endif From cce8012fe809e4aedff4deb40553caf5f6d2c266 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 29 Sep 2023 14:20:06 -0700 Subject: [PATCH 17/23] Run clang format --- benchmarks/bench-all.cpp | 17 +++++++----- benchmarks/bench-qsort.hpp | 1 - lib/x86simdsort-internal.h | 56 +++++++++++++++++--------------------- lib/x86simdsort-scalar.h | 8 ++++-- lib/x86simdsort.cpp | 7 ++--- lib/x86simdsort.h | 22 +++++++-------- src/avx512-common-qsort.h | 3 +- tests/test-qsort-common.h | 2 +- utils/custom-float.h | 1 + 9 files changed, 56 insertions(+), 61 deletions(-) diff --git a/benchmarks/bench-all.cpp b/benchmarks/bench-all.cpp index d445bc63..23fc17a0 100644 --- a/benchmarks/bench-all.cpp +++ b/benchmarks/bench-all.cpp @@ -1,5 +1,5 @@ -#include "x86simdsort.h" #include "rand_array.h" +#include "x86simdsort.h" #include #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \ @@ -12,10 +12,14 @@ }))) #define BENCH_SORT(func, type) \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_128, 128, std::string("random")); \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_256, 256, std::string("random")); \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_512, 512, std::string("random")); \ - MY_BENCHMARK_CAPTURE(func, type, smallrandom_1k, 1024, std::string("random")); \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_128, 128, std::string("random")); \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_256, 256, std::string("random")); \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_512, 512, std::string("random")); \ + MY_BENCHMARK_CAPTURE( \ + func, type, smallrandom_1k, 1024, std::string("random")); \ MY_BENCHMARK_CAPTURE(func, type, random_5k, 5000, std::string("random")); \ MY_BENCHMARK_CAPTURE( \ func, type, random_100k, 100000, std::string("random")); \ @@ -34,10 +38,9 @@ MY_BENCHMARK_CAPTURE(func, type, k10, 10000, 10); \ MY_BENCHMARK_CAPTURE(func, type, k100, 10000, 100); \ MY_BENCHMARK_CAPTURE(func, type, k1000, 10000, 1000); \ - MY_BENCHMARK_CAPTURE(func, type, k5000, 10000, 5000); \ + MY_BENCHMARK_CAPTURE(func, type, k5000, 10000, 5000); #include "bench-argsort.hpp" #include "bench-partial-qsort.hpp" #include "bench-qselect.hpp" #include "bench-qsort.hpp" - diff --git a/benchmarks/bench-qsort.hpp b/benchmarks/bench-qsort.hpp index 277f7bf5..f95b05ba 100644 --- a/benchmarks/bench-qsort.hpp +++ b/benchmarks/bench-qsort.hpp @@ -51,4 +51,3 @@ BENCH_BOTH_QSORT(double) #ifdef __FLT16_MAX__ BENCH_BOTH_QSORT(_Float16) #endif - diff --git a/lib/x86simdsort-internal.h b/lib/x86simdsort-internal.h index d9618d93..7e716e8d 100644 --- a/lib/x86simdsort-internal.h +++ b/lib/x86simdsort-internal.h @@ -1,75 +1,69 @@ #ifndef XSS_INTERNAL_METHODS #define XSS_INTERNAL_METHODS +#include "x86simdsort.h" #include #include -#include "x86simdsort.h" namespace xss { namespace avx512 { // quicksort template - XSS_HIDE_SYMBOL - void qsort(T *arr, size_t arrsize); + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template - XSS_HIDE_SYMBOL - void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); + XSS_HIDE_SYMBOL void + qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template - XSS_HIDE_SYMBOL - void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); + XSS_HIDE_SYMBOL void + partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template - XSS_HIDE_SYMBOL - std::vector argsort(T *arr, size_t arrsize); + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template - XSS_HIDE_SYMBOL - std::vector argselect(T *arr, size_t k, size_t arrsize); + XSS_HIDE_SYMBOL std::vector + argselect(T *arr, size_t k, size_t arrsize); } // namespace avx512 namespace avx2 { // quicksort template - XSS_HIDE_SYMBOL - void qsort(T *arr, size_t arrsize); + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template - XSS_HIDE_SYMBOL - void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); + XSS_HIDE_SYMBOL void + qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template - XSS_HIDE_SYMBOL - void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); + XSS_HIDE_SYMBOL void + partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template - XSS_HIDE_SYMBOL - std::vector argsort(T *arr, size_t arrsize); + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template - XSS_HIDE_SYMBOL - std::vector argselect(T *arr, size_t k, size_t arrsize); + XSS_HIDE_SYMBOL std::vector + argselect(T *arr, size_t k, size_t arrsize); } // namespace avx2 namespace scalar { // quicksort template - XSS_HIDE_SYMBOL - void qsort(T *arr, size_t arrsize); + XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template - XSS_HIDE_SYMBOL - void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); + XSS_HIDE_SYMBOL void + qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template - XSS_HIDE_SYMBOL - void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); + XSS_HIDE_SYMBOL void + partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template - XSS_HIDE_SYMBOL - std::vector argsort(T *arr, size_t arrsize); + XSS_HIDE_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template - XSS_HIDE_SYMBOL - std::vector argselect(T *arr, size_t k, size_t arrsize); + XSS_HIDE_SYMBOL std::vector + argselect(T *arr, size_t k, size_t arrsize); } // namespace scalar } // namespace xss #endif diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h index 06ea858a..6e8d67bf 100644 --- a/lib/x86simdsort-scalar.h +++ b/lib/x86simdsort-scalar.h @@ -1,6 +1,6 @@ +#include "custom-compare.h" #include #include -#include "custom-compare.h" namespace xss { namespace scalar { @@ -13,7 +13,8 @@ namespace scalar { void qselect(T *arr, size_t k, size_t arrsize, bool hasnan) { if (hasnan) { - std::nth_element(arr, arr + k, arr + arrsize, compare>()); + std::nth_element( + arr, arr + k, arr + arrsize, compare>()); } else { std::nth_element(arr, arr + k, arr + arrsize); @@ -23,7 +24,8 @@ namespace scalar { void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan) { if (hasnan) { - std::partial_sort(arr, arr + k, arr + arrsize, compare>()); + std::partial_sort( + arr, arr + k, arr + arrsize, compare>()); } else { std::partial_sort(arr, arr + k, arr + arrsize); diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 57568e1f..091d717a 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -30,8 +30,8 @@ static int check_cpu_feature_support(std::string_view cpufeature) return 0; } -std::string_view -static find_preferred_cpu(std::initializer_list cpulist) +std::string_view static find_preferred_cpu( + std::initializer_list cpulist) { for (auto cpu : cpulist) { if (check_cpu_feature_support(cpu)) return cpu; @@ -70,8 +70,7 @@ dispatch_requested(std::string_view cpurequested, } #define DECLARE_INTERNAL_partial_qsort(TYPE) \ - static void (*internal_partial_qsort##TYPE)( \ - TYPE *, size_t, size_t, bool) \ + static void (*internal_partial_qsort##TYPE)(TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ void partial_qsort(TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 6b424a8e..e3f54b39 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -3,29 +3,27 @@ #include #include -#define XSS_EXPORT_SYMBOL __attribute__ ((visibility ("default"))) -#define XSS_HIDE_SYMBOL __attribute__ ((visibility ("hidden"))) +#define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) +#define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) namespace x86simdsort { // quicksort template -XSS_EXPORT_SYMBOL -void qsort(T *arr, size_t arrsize); +XSS_EXPORT_SYMBOL void qsort(T *arr, size_t arrsize); // quickselect template -XSS_EXPORT_SYMBOL -void qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); +XSS_EXPORT_SYMBOL void +qselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); // partial sort template -XSS_EXPORT_SYMBOL -void partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); +XSS_EXPORT_SYMBOL void +partial_qsort(T *arr, size_t k, size_t arrsize, bool hasnan = false); // argsort template -XSS_EXPORT_SYMBOL -std::vector argsort(T *arr, size_t arrsize); +XSS_EXPORT_SYMBOL std::vector argsort(T *arr, size_t arrsize); // argselect template -XSS_EXPORT_SYMBOL -std::vector argselect(T *arr, size_t k, size_t arrsize); +XSS_EXPORT_SYMBOL std::vector +argselect(T *arr, size_t k, size_t arrsize); } // namespace x86simdsort #endif diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index cd60575a..89901a20 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -186,8 +186,7 @@ replace_inf_with_nan(type_t *arr, arrsize_t size, arrsize_t nan_count) * in the array which is not a nan */ template -X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, - arrsize_t size) +X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, arrsize_t size) { arrsize_t jj = size - 1; arrsize_t ii = 0; diff --git a/tests/test-qsort-common.h b/tests/test-qsort-common.h index 00ab80f9..6b8241b3 100644 --- a/tests/test-qsort-common.h +++ b/tests/test-qsort-common.h @@ -1,8 +1,8 @@ #ifndef AVX512_TEST_COMMON #define AVX512_TEST_COMMON -#include "rand_array.h" #include "custom-compare.h" +#include "rand_array.h" #include "x86simdsort.h" #include diff --git a/utils/custom-float.h b/utils/custom-float.h index 7c823f0c..001d4245 100644 --- a/utils/custom-float.h +++ b/utils/custom-float.h @@ -1,5 +1,6 @@ #ifndef UTILS_FLOAT #define UTILS_FLOAT +#include namespace xss { namespace fp { From f568e9d9d3aa9bdd0e3fedbf1c30bfc198f4442d Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 2 Oct 2023 11:17:50 -0700 Subject: [PATCH 18/23] Add API used by NumPy --- src/avx512-64bit-argsort.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/avx512-64bit-argsort.hpp b/src/avx512-64bit-argsort.hpp index 1a20c7c0..4571a469 100644 --- a/src/avx512-64bit-argsort.hpp +++ b/src/avx512-64bit-argsort.hpp @@ -408,4 +408,20 @@ avx512_argselect(T *arr, arrsize_t k, arrsize_t arrsize) return indices; } +/* To maintain compatibility with NumPy build */ +template +X86_SIMD_SORT_INLINE void +avx512_argselect(T *arr, int64_t *arg, arrsize_t k, arrsize_t arrsize) +{ + avx512_argselect(arr, reinterpret_cast(arg), k, arrsize); +} + +template +X86_SIMD_SORT_INLINE void +avx512_argsort(T *arr, int64_t *arg, arrsize_t arrsize) +{ + avx512_argsort(arr, reinterpret_cast(arg), arrsize); +} + + #endif // AVX512_ARGSORT_64BIT From 6ec7976dad9edd2834e4da8d05a6886716476247 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 2 Oct 2023 11:23:18 -0700 Subject: [PATCH 19/23] Add SPR to CI run --- .github/workflows/c-cpp.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 84a49c82..3e547824 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -50,3 +50,6 @@ jobs: - name: Run test suite on TGL run: sde -tgl -- ./builddir/testexe + + - name: Run test suite on SPR + run: sde -spr -- ./builddir/testexe From 121458b5101a83e2b9ccb5f1b407f241e87d50bb Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 2 Oct 2023 15:25:18 -0700 Subject: [PATCH 20/23] Use intel CI runners --- .github/workflows/build-numpy.yml | 2 +- .github/workflows/c-cpp.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml index cda490f9..98a01db7 100644 --- a/.github/workflows/build-numpy.yml +++ b/.github/workflows/build-numpy.yml @@ -11,7 +11,7 @@ on: jobs: NumPyMultiarrayTests: - runs-on: ubuntu-latest + runs-on: intel-ubuntu-latest steps: - name: Checkout x86-simd-sort diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 3e547824..e1e102a0 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -9,7 +9,7 @@ on: jobs: tests: - runs-on: ubuntu-latest + runs-on: intel-ubuntu-latest steps: - uses: actions/checkout@v3 From 336c99891014000de018b990bac6658ee02c9434 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 3 Oct 2023 10:02:04 -0700 Subject: [PATCH 21/23] Dont explicitly build with -O3, use meson release buildtype instead --- Makefile | 7 ++++++- lib/meson.build | 6 +++--- meson.build | 2 +- scripts/bench-compare.sh | 2 +- scripts/branch-compare.sh | 2 +- tests/meson.build | 1 - 6 files changed, 12 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 27302673..f25c8dad 100644 --- a/Makefile +++ b/Makefile @@ -75,9 +75,14 @@ benchexe: $(BENCHOBJS) $(UTILOBJS) .PHONY: meson meson: - meson setup --warnlevel 2 --werror --buildtype plain builddir + meson setup --warnlevel 2 --werror --buildtype release builddir cd builddir && ninja +.PHONY: mesondebug +mesondebug: + meson setup --warnlevel 2 --werror --buildtype debug debug + cd debug && ninja + .PHONY: clean clean: $(RM) -rf $(TESTOBJS) $(BENCHOBJS) $(UTILOBJS) testexe benchexe builddir diff --git a/lib/meson.build b/lib/meson.build index e567dde2..fc544701 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -6,7 +6,7 @@ if cpp.has_argument('-march=skylake-avx512') 'x86simdsort-skx.cpp', ), include_directories : [src], - cpp_args : ['-O3', '-march=skylake-avx512', flags_hide_symbols], + cpp_args : ['-march=skylake-avx512', flags_hide_symbols], ) endif @@ -16,7 +16,7 @@ if cpp.has_argument('-march=icelake-client') 'x86simdsort-icl.cpp', ), include_directories : [src], - cpp_args : ['-O3', '-march=icelake-client', flags_hide_symbols], + cpp_args : ['-march=icelake-client', flags_hide_symbols], ) endif @@ -26,6 +26,6 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-O3', '-march=sapphirerapids', flags_hide_symbols], + cpp_args : ['-march=sapphirerapids', flags_hide_symbols], ) endif diff --git a/meson.build b/meson.build index 2bf65ca0..c90e6eae 100644 --- a/meson.build +++ b/meson.build @@ -26,7 +26,7 @@ libsimdsort = shared_library('x86simdsort', 'lib/x86simdsort.cpp', include_directories : [utils, lib], link_with : [libtargets], - cpp_args : ['-O3', flags_hide_symbols], + cpp_args : [flags_hide_symbols], ) if gtest_dep.found() diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh index 498a6b66..57347cce 100755 --- a/scripts/bench-compare.sh +++ b/scripts/bench-compare.sh @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then fi compare=$(realpath .bench/google-benchmark/tools/compare.py) -meson setup --warnlevel 0 --buildtype plain builddir-${branch} +meson setup --warnlevel 0 --buildtype release builddir-${branch} cd builddir-${branch} ninja $compare filters ./benchexe $1 $2 diff --git a/scripts/branch-compare.sh b/scripts/branch-compare.sh index ff8b3474..6b6b6610 100755 --- a/scripts/branch-compare.sh +++ b/scripts/branch-compare.sh @@ -26,7 +26,7 @@ build_branch() { fi fi cd $dir_name - meson setup --warnlevel 0 --buildtype plain builddir + meson setup --warnlevel 0 --buildtype release builddir cd builddir ninja cd ../../ diff --git a/tests/meson.build b/tests/meson.build index fece2415..172ddf01 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -4,7 +4,6 @@ libtests += static_library('tests_qsort', files('test-qsort.cpp', ), dependencies: gtest_dep, include_directories : [lib, utils], - cpp_args : ['-O3'], ) #if cancompilefp16 From dee950586df36c859b33b5d558b7d3df663c4a6a Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 3 Oct 2023 11:41:01 -0700 Subject: [PATCH 22/23] Fix NAN check for _Float16 --- src/avx512-16bit-qsort.hpp | 3 ++- src/avx512-common-qsort.h | 6 +++++- src/avx512fp16-16bit-qsort.hpp | 27 +++++++++++++++++++++------ 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index a204310d..edd118b3 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -445,7 +445,8 @@ arrsize_t replace_nan_with_inf>(uint16_t *arr, template <> bool is_a_nan(uint16_t elem) { - return (elem & 0x7c00) == 0x7c00; + return ((elem & 0x7c00u) == 0x7c00u) && + ((elem & 0x03ffu) != 0); } X86_SIMD_SORT_INLINE diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h index 89901a20..99717207 100644 --- a/src/avx512-common-qsort.h +++ b/src/avx512-common-qsort.h @@ -191,7 +191,7 @@ X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, arrsize_t size) arrsize_t jj = size - 1; arrsize_t ii = 0; arrsize_t count = 0; - while (ii <= jj) { + while (ii < jj) { if (is_a_nan(arr[ii])) { std::swap(arr[ii], arr[jj]); jj -= 1; @@ -201,6 +201,10 @@ X86_SIMD_SORT_INLINE arrsize_t move_nans_to_end_of_array(T *arr, arrsize_t size) ii += 1; } } + /* Haven't checked for nan when ii == jj */ + if (is_a_nan(arr[ii])) { + count++; + } return size - count - 1; } diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 4aee3de2..7d0f0a06 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -145,17 +145,19 @@ struct zmm_vector<_Float16> { template <> bool is_a_nan<_Float16>(_Float16 elem) { - Fp16Bits temp; - temp.f_ = elem; - return (temp.i_ & 0x7c00) == 0x7c00; + return elem != elem; } template <> -void replace_inf_with_nan(_Float16 *arr, arrsize_t arrsize, arrsize_t nan_count) +void replace_inf_with_nan(_Float16 *arr, arrsize_t size, arrsize_t nan_count) { - memset(arr + arrsize - nan_count, 0xFF, nan_count * 2); + Fp16Bits val; + val.i_ = 0x7c01; + for (arrsize_t ii = size - 1; nan_count > 0; --ii) { + arr[ii] = val.f_; + nan_count -= 1; + } } - /* Specialized template function for _Float16 qsort_*/ template <> void avx512_qsort(_Float16 *arr, arrsize_t arrsize) @@ -169,4 +171,17 @@ void avx512_qsort(_Float16 *arr, arrsize_t arrsize) replace_inf_with_nan(arr, arrsize, nan_count); } } + +template <> +void avx512_qselect(_Float16 *arr, arrsize_t k, arrsize_t arrsize, bool hasnan) +{ + arrsize_t indx_last_elem = arrsize - 1; + if (UNLIKELY(hasnan)) { + indx_last_elem = move_nans_to_end_of_array(arr, arrsize); + } + if (indx_last_elem >= k) { + qselect_, _Float16>( + arr, k, 0, indx_last_elem, 2 * (arrsize_t)log2(indx_last_elem)); + } +} #endif // AVX512FP16_QSORT_16BIT From c74bc0e2500700aa62718967f2f8bb1d0fb964ae Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 3 Oct 2023 20:58:58 -0700 Subject: [PATCH 23/23] Split CI into 4 jobs --- .github/workflows/c-cpp.yml | 110 +++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 3 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index e1e102a0..762a24d2 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -7,7 +7,7 @@ on: branches: [ "main" ] jobs: - tests: + SKL: runs-on: intel-ubuntu-latest @@ -29,7 +29,7 @@ jobs: - name: Install Intel SDE run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/732268/sde-external-9.7.0-2022-05-09-lin.tar.xz + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde @@ -38,18 +38,122 @@ jobs: CXX: g++-13 run: | make clean - meson setup --warnlevel 2 --werror --buildtype plain builddir + meson setup --warnlevel 2 --werror --buildtype release builddir cd builddir ninja - name: Run test suite on SKL run: sde -skl -- ./builddir/testexe + SKX: + + runs-on: intel-ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Install dependencies + run: | + sudo apt update + sudo apt -y install g++-13 libgtest-dev meson curl git cmake + + - name: Install google benchmarks + run: | + git clone https://github.com/google/benchmark.git + cd benchmark + cmake -E make_directory "build" + cmake -E chdir "build" cmake -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ../ + sudo cmake --build "build" --config Release --target install + + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + + - name: Build + env: + CXX: g++-13 + run: | + make clean + meson setup --warnlevel 2 --werror --buildtype release builddir + cd builddir + ninja + - name: Run test suite on SKX run: sde -skx -- ./builddir/testexe + TGL: + + runs-on: intel-ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Install dependencies + run: | + sudo apt update + sudo apt -y install g++-13 libgtest-dev meson curl git cmake + + - name: Install google benchmarks + run: | + git clone https://github.com/google/benchmark.git + cd benchmark + cmake -E make_directory "build" + cmake -E chdir "build" cmake -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ../ + sudo cmake --build "build" --config Release --target install + + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + + - name: Build + env: + CXX: g++-13 + run: | + make clean + meson setup --warnlevel 2 --werror --buildtype release builddir + cd builddir + ninja - name: Run test suite on TGL run: sde -tgl -- ./builddir/testexe + SPR: + + runs-on: intel-ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Install dependencies + run: | + sudo apt update + sudo apt -y install g++-13 libgtest-dev meson curl git cmake + + - name: Install google benchmarks + run: | + git clone https://github.com/google/benchmark.git + cd benchmark + cmake -E make_directory "build" + cmake -E chdir "build" cmake -DBENCHMARK_ENABLE_GTEST_TESTS=OFF -DBENCHMARK_ENABLE_TESTING=OFF -DCMAKE_BUILD_TYPE=Release ../ + sudo cmake --build "build" --config Release --target install + + - name: Install Intel SDE + run: | + curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz + mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ + sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde + + - name: Build + env: + CXX: g++-13 + run: | + make clean + meson setup --warnlevel 2 --werror --buildtype release builddir + cd builddir + ninja + - name: Run test suite on SPR run: sde -spr -- ./builddir/testexe