From 67c7534bfcb277368502f1d496a6c1075542b645 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 14 Nov 2023 09:35:33 -0800 Subject: [PATCH 1/5] Add key-value sort to x86simdsort dispatch --- lib/x86simdsort-internal.h | 9 +++++++ lib/x86simdsort-scalar.h | 28 ++++++++++++++++++++++ lib/x86simdsort-skx.cpp | 18 ++++++++++++++ lib/x86simdsort.cpp | 40 ++++++++++++++++++++++++++++++- lib/x86simdsort.h | 5 ++++ src/avx512-64bit-keyvaluesort.hpp | 10 ++++---- tests/meson.build | 6 +++++ 7 files changed, 111 insertions(+), 5 deletions(-) diff --git a/lib/x86simdsort-internal.h b/lib/x86simdsort-internal.h index c7ec80b2..550227e4 100644 --- a/lib/x86simdsort-internal.h +++ b/lib/x86simdsort-internal.h @@ -9,6 +9,9 @@ namespace avx512 { // quicksort template XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize, bool hasnan = false); + // key-value quicksort + template + XSS_EXPORT_SYMBOL void keyvalue_qsort(T1 *key, T2* val, size_t arrsize, bool hasnan = false); // quickselect template XSS_HIDE_SYMBOL void @@ -30,6 +33,9 @@ namespace avx2 { // quicksort template XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize, bool hasnan = false); + // key-value quicksort + template + XSS_EXPORT_SYMBOL void keyvalue_qsort(T1 *key, T2* val, size_t arrsize, bool hasnan = false); // quickselect template XSS_HIDE_SYMBOL void @@ -51,6 +57,9 @@ namespace scalar { // quicksort template XSS_HIDE_SYMBOL void qsort(T *arr, size_t arrsize, bool hasnan = false); + // key-value quicksort + template + XSS_EXPORT_SYMBOL void keyvalue_qsort(T1 *key, T2* val, size_t arrsize, bool hasnan = false); // quickselect template XSS_HIDE_SYMBOL void diff --git a/lib/x86simdsort-scalar.h b/lib/x86simdsort-scalar.h index b048700c..81d7e226 100644 --- a/lib/x86simdsort-scalar.h +++ b/lib/x86simdsort-scalar.h @@ -3,6 +3,27 @@ #include namespace xss { +namespace utils { +/* O(1) permute array in place: stolen from + * http://www.davidespataro.it/apply-a-permutation-to-a-vector */ +template +void 
apply_permutation_in_place(T* arr, std::vector arg) +{ + for(size_t i = 0 ; i < arg.size() ; i++) { + size_t curr = i; + size_t next = arg[curr]; + while(next != i) + { + std::swap(arr[curr], arr[next]); + arg[curr] = curr; + curr = next; + next = arg[next]; + } + arg[curr] = curr; + } +} +} // utils + namespace scalar { template void qsort(T *arr, size_t arrsize, bool hasnan) @@ -57,6 +78,13 @@ namespace scalar { compare_arg>(arr)); return arg; } + template + void keyvalue_qsort(T1 *key, T2* val, size_t arrsize, bool hasnan) + { + std::vector arg = argsort(key, arrsize, hasnan); + utils::apply_permutation_in_place(key, arg); + utils::apply_permutation_in_place(val, arg); + } } // namespace scalar } // namespace xss diff --git a/lib/x86simdsort-skx.cpp b/lib/x86simdsort-skx.cpp index 81c8f019..1dabfb71 100644 --- a/lib/x86simdsort-skx.cpp +++ b/lib/x86simdsort-skx.cpp @@ -1,5 +1,6 @@ // SKX specific routines: #include "avx512-32bit-qsort.hpp" +#include "avx512-64bit-keyvaluesort.hpp" #include "avx512-64bit-argsort.hpp" #include "avx512-64bit-qsort.hpp" #include "x86simdsort-internal.h" @@ -32,6 +33,14 @@ return avx512_argselect(arr, k, arrsize, hasnan); \ } +#define DEFINE_KEYVALUE_METHODS(type1, type2) \ + template <> \ + void keyvalue_qsort(type1 *key, type2* val, size_t arrsize, bool hasnan) \ + { \ + avx512_qsort_kv(key, val, arrsize, hasnan); \ + } \ + + namespace xss { namespace avx512 { DEFINE_ALL_METHODS(uint32_t) @@ -40,5 +49,14 @@ namespace avx512 { DEFINE_ALL_METHODS(uint64_t) DEFINE_ALL_METHODS(int64_t) DEFINE_ALL_METHODS(double) + DEFINE_KEYVALUE_METHODS(double, uint64_t) + DEFINE_KEYVALUE_METHODS(double, int64_t) + DEFINE_KEYVALUE_METHODS(double, double) + DEFINE_KEYVALUE_METHODS(uint64_t, uint64_t) + DEFINE_KEYVALUE_METHODS(uint64_t, int64_t) + DEFINE_KEYVALUE_METHODS(uint64_t, double) + DEFINE_KEYVALUE_METHODS(int64_t, uint64_t) + DEFINE_KEYVALUE_METHODS(int64_t, int64_t) + DEFINE_KEYVALUE_METHODS(int64_t, double) } // namespace avx512 } // 
namespace xss diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 0ec54bef..86caeb0e 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -51,6 +51,8 @@ dispatch_requested(std::string_view cpurequested, return false; } +namespace x86simdsort { + #define CAT_(a, b) a##b #define CAT(a, b) CAT_(a, b) @@ -120,6 +122,33 @@ dispatch_requested(std::string_view cpurequested, return; \ } \ } \ + } \ + +#define DISPATCH_KEYVALUE_SORT(TYPE1, TYPE2, ISA) \ + static void (CAT(CAT(*internal_kv_qsort_, TYPE1), TYPE2))(TYPE1*, TYPE2*, size_t, bool) = NULL; \ + template <> \ + void keyvalue_qsort(TYPE1 *key, TYPE2* val, size_t arrsize, bool hasnan) \ + { \ + (CAT(CAT(*internal_kv_qsort_, TYPE1), TYPE2))(key, val, arrsize, hasnan); \ + } \ + static __attribute__((constructor)) void \ + CAT(CAT(resolve_keyvalue_qsort_, TYPE1), TYPE2)(void) \ + { \ + CAT(CAT(internal_kv_qsort_, TYPE1), TYPE2) = &xss::scalar::keyvalue_qsort; \ + __builtin_cpu_init(); \ + std::string_view preferred_cpu = find_preferred_cpu(ISA); \ + if constexpr (dispatch_requested("avx512", ISA)) { \ + if (preferred_cpu.find("avx512") != std::string_view::npos) { \ + CAT(CAT(internal_kv_qsort_, TYPE1), TYPE2) = &xss::avx512::keyvalue_qsort; \ + return; \ + } \ + } \ + if constexpr (dispatch_requested("avx2", ISA)) { \ + if (preferred_cpu.find("avx2") != std::string_view::npos) { \ + CAT(CAT(internal_kv_qsort_, TYPE1), TYPE2) = &xss::avx2::keyvalue_qsort; \ + return; \ + } \ + } \ } #define ISA_LIST(...) 
\ @@ -128,7 +157,6 @@ dispatch_requested(std::string_view cpurequested, __VA_ARGS__ \ } -namespace x86simdsort { #ifdef __FLT16_MAX__ DISPATCH(qsort, _Float16, ISA_LIST("avx512_spr")) DISPATCH(qselect, _Float16, ISA_LIST("avx512_spr")) @@ -168,4 +196,14 @@ DISPATCH_ALL(argselect, (ISA_LIST("avx512_skx")), (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(uint64_t, int64_t, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(uint64_t, uint64_t, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(uint64_t, double, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(int64_t, int64_t, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(int64_t, uint64_t, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(int64_t, double, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(double, int64_t, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(double, double, (ISA_LIST("avx512_skx"))) +DISPATCH_KEYVALUE_SORT(double, uint64_t, (ISA_LIST("avx512_skx"))) + } // namespace x86simdsort diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 738a2a15..907ba43d 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -34,5 +34,10 @@ template XSS_EXPORT_SYMBOL std::vector argselect(T *arr, size_t k, size_t arrsize, bool hasnan = false); +// key-value quicksort +template +XSS_EXPORT_SYMBOL void +keyvalue_qsort(T1 *key, T2* val, size_t arrsize, bool hasnan = false); + } // namespace x86simdsort #endif diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp index b1ec0cd2..8281d2db 100644 --- a/src/avx512-64bit-keyvaluesort.hpp +++ b/src/avx512-64bit-keyvaluesort.hpp @@ -542,8 +542,8 @@ heapify(type1_t *keys, type2_t *indexes, arrsize_t idx, arrsize_t size) arrsize_t i = idx; while (true) { arrsize_t j = 2 * i + 1; - if (j >= size || j < 0) { break; } - int k = j + 1; + if (j >= size) { break; } + arrsize_t k = j + 1; if (k < size && keys[j] < keys[k]) { j = k; } if (keys[j] < keys[i]) { break; } std::swap(keys[i], keys[j]); @@ -558,8 +558,9 @@ template = 0; i--) { + for 
(arrsize_t i = size / 2 - 1; ; i--) { heapify(keys, indexes, i, size); + if (i == 0) { break; } } for (arrsize_t i = size - 1; i > 0; i--) { std::swap(keys[0], keys[i]); @@ -614,8 +615,9 @@ X86_SIMD_SORT_INLINE void qsort_64bit_(type1_t *keys, template X86_SIMD_SORT_INLINE void -avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize) +avx512_qsort_kv(T1 *keys, T2 *indexes, arrsize_t arrsize, bool hasnan = false) { + UNUSED(hasnan); if (arrsize > 1) { if constexpr (std::is_floating_point_v) { arrsize_t nan_count diff --git a/tests/meson.build b/tests/meson.build index 172ddf01..ef7de312 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -6,6 +6,12 @@ libtests += static_library('tests_qsort', include_directories : [lib, utils], ) +libtests += static_library('tests_kvsort', + files('test-keyvalue.cpp', ), + dependencies: gtest_dep, + include_directories : [lib, utils], + ) + #if cancompilefp16 # libtests += static_library('tests_qsortfp16', # files('test-qsortfp16.cpp', ), From 22d26362c5d6b2c7dc57a29a99ec28d17ea1331b Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 16 Nov 2023 11:50:18 -0800 Subject: [PATCH 2/5] make custom float functions static --- utils/custom-float.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/utils/custom-float.h b/utils/custom-float.h index 001d4245..291912b9 100644 --- a/utils/custom-float.h +++ b/utils/custom-float.h @@ -8,32 +8,32 @@ namespace fp inline constexpr bool is_floating_point_v = std::is_floating_point_v; template - bool isnan(T elem) + static bool isnan(T elem) { return std::isnan(elem); } template - bool isunordered(T a, T b) + static bool isunordered(T a, T b) { return std::isunordered(a, b); } template - T max() + static T max() { return std::numeric_limits::max(); } template - T min() + static T min() { return std::numeric_limits::min(); } template - T infinity() + static T infinity() { return std::numeric_limits::infinity(); } template - T 
quiet_NaN() + static T quiet_NaN() { return std::numeric_limits::quiet_NaN(); } @@ -44,7 +44,7 @@ namespace fp uint16_t i_; } Fp16Bits; - _Float16 convert_bits(uint16_t val) + static _Float16 convert_bits(uint16_t val) { Fp16Bits temp; temp.i_ = val; @@ -52,35 +52,35 @@ namespace fp } template <> - inline constexpr bool is_floating_point_v<_Float16> = true; + [[maybe_unused]] inline constexpr bool is_floating_point_v<_Float16> = true; template <> - bool isnan<_Float16>(_Float16 elem) + [[maybe_unused]] bool isnan<_Float16>(_Float16 elem) { return elem != elem; } template <> - bool isunordered<_Float16>(_Float16 a, _Float16 b) + [[maybe_unused]] bool isunordered<_Float16>(_Float16 a, _Float16 b) { return isnan(a) || isnan(b); } template <> - _Float16 max<_Float16>() + [[maybe_unused]] _Float16 max<_Float16>() { return convert_bits(0x7bff); } template <> - _Float16 min<_Float16>() + [[maybe_unused]] _Float16 min<_Float16>() { return convert_bits(0x0400); } template <> - _Float16 infinity<_Float16>() + [[maybe_unused]] _Float16 infinity<_Float16>() { return convert_bits(0x7c00); } template <> - _Float16 quiet_NaN<_Float16>() + [[maybe_unused]] _Float16 quiet_NaN<_Float16>() { return convert_bits(0x7c01); } From 98fdfe3057b30a00a7187a236cd6f84c89b9ab47 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Thu, 16 Nov 2023 13:18:10 -0800 Subject: [PATCH 3/5] Add tests for keyvalue --- tests/test-keyvalue.cpp | 64 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 tests/test-keyvalue.cpp diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp new file mode 100644 index 00000000..02900ff7 --- /dev/null +++ b/tests/test-keyvalue.cpp @@ -0,0 +1,64 @@ +/******************************************* + * * Copyright (C) 2022-2023 Intel Corporation + * * SPDX-License-Identifier: BSD-3-Clause + * *******************************************/ + +#include "rand_array.h" +#include "x86simdsort.h" +#include "x86simdsort-scalar.h" 
+#include + +template +class simdkvsort : public ::testing::Test { +public: + simdkvsort() + { + std::iota(arrsize.begin(), arrsize.end(), 1); + arrtype = {"random", + "constant", + "sorted", + "reverse", + "smallrange", + "max_at_the_end", + "rand_max"}; + } + std::vector arrtype; + std::vector arrsize = std::vector(1024); +}; + +TYPED_TEST_SUITE_P(simdkvsort); + +TYPED_TEST_P(simdkvsort, test_kvsort) +{ + using T1 = typename std::tuple_element<0, decltype(TypeParam())>::type; + using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type; + for (auto type : this->arrtype) { + bool hasnan = (type == "rand_with_nan") ? true : false; + for (auto size : this->arrsize) { + std::vector v1 = get_array(type, size); + std::vector v2 = get_array(type, size); + std::vector v1_bckp = v1; + std::vector v2_bckp = v2; + xss::scalar::keyvalue_qsort(v1_bckp.data(), v2_bckp.data(), size, hasnan); + x86simdsort::keyvalue_qsort(v1.data(), v2.data(), size, hasnan); + ASSERT_EQ(v1, v1_bckp); + ASSERT_EQ(v2, v2_bckp); + v1.clear(); v2.clear(); + v1_bckp.clear(); v2_bckp.clear(); + } + } +} + +REGISTER_TYPED_TEST_SUITE_P(simdkvsort, test_kvsort); + +using QKVSortTestTypes = testing::Types, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; + +INSTANTIATE_TYPED_TEST_SUITE_P(xss, simdkvsort, QKVSortTestTypes); From b8600a46bfb10d5e315ea204a1e2d63ba4e9b692 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 17 Nov 2023 12:05:05 -0800 Subject: [PATCH 4/5] Update key-value test to handle duplicate entries in key vector --- tests/test-keyvalue.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp index 02900ff7..3a73c08e 100644 --- a/tests/test-keyvalue.cpp +++ b/tests/test-keyvalue.cpp @@ -35,16 +35,19 @@ TYPED_TEST_P(simdkvsort, test_kvsort) for (auto type : this->arrtype) { bool hasnan = (type == "rand_with_nan") ? 
true : false; for (auto size : this->arrsize) { - std::vector v1 = get_array(type, size); - std::vector v2 = get_array(type, size); - std::vector v1_bckp = v1; - std::vector v2_bckp = v2; - xss::scalar::keyvalue_qsort(v1_bckp.data(), v2_bckp.data(), size, hasnan); - x86simdsort::keyvalue_qsort(v1.data(), v2.data(), size, hasnan); - ASSERT_EQ(v1, v1_bckp); - ASSERT_EQ(v2, v2_bckp); - v1.clear(); v2.clear(); - v1_bckp.clear(); v2_bckp.clear(); + std::vector key = get_array(type, size); + std::vector val = get_array(type, size); + std::vector key_bckp = key; + std::vector val_bckp = val; + x86simdsort::keyvalue_qsort(key.data(), val.data(), size, hasnan); + xss::scalar::keyvalue_qsort(key_bckp.data(), val_bckp.data(), size, hasnan); + ASSERT_EQ(key, key_bckp); + const bool hasDuplicates = std::adjacent_find(key.begin(), key.end()) != key.end(); + if (!hasDuplicates) { + ASSERT_EQ(val, val_bckp); + } + key.clear(); val.clear(); + key_bckp.clear(); val_bckp.clear(); } } } From aba837133f3ef952065ed63e0e028703b9d0bde7 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 17 Nov 2023 12:15:28 -0800 Subject: [PATCH 5/5] Add benchmarks for keyvalue sort --- benchmarks/bench-all.cpp | 1 + benchmarks/bench-keyvalue.hpp | 48 +++++++++++++++++++++++++++++++++++ run-bench.py | 3 +++ 3 files changed, 52 insertions(+) create mode 100644 benchmarks/bench-keyvalue.hpp diff --git a/benchmarks/bench-all.cpp b/benchmarks/bench-all.cpp index 23fc17a0..b51b5fb4 100644 --- a/benchmarks/bench-all.cpp +++ b/benchmarks/bench-all.cpp @@ -44,3 +44,4 @@ #include "bench-partial-qsort.hpp" #include "bench-qselect.hpp" #include "bench-qsort.hpp" +#include "bench-keyvalue.hpp" diff --git a/benchmarks/bench-keyvalue.hpp b/benchmarks/bench-keyvalue.hpp new file mode 100644 index 00000000..101a8fae --- /dev/null +++ b/benchmarks/bench-keyvalue.hpp @@ -0,0 +1,48 @@ +#include "x86simdsort-scalar.h" + +template +static void scalarkvsort(benchmark::State &state, Args &&...args) +{ + // Get args 
+ auto args_tuple = std::make_tuple(std::move(args)...); + size_t arrsize = std::get<0>(args_tuple); + std::string arrtype = std::get<1>(args_tuple); + // set up array + std::vector key = get_array(arrtype, arrsize); + std::vector val = get_array("random", arrsize); + std::vector key_bkp = key; + // benchmark + for (auto _ : state) { + xss::scalar::keyvalue_qsort(key.data(), val.data(), arrsize, false); + state.PauseTiming(); + key = key_bkp; + state.ResumeTiming(); + } +} + +template +static void simdkvsort(benchmark::State &state, Args &&...args) +{ + auto args_tuple = std::make_tuple(std::move(args)...); + size_t arrsize = std::get<0>(args_tuple); + std::string arrtype = std::get<1>(args_tuple); + // set up array + std::vector key = get_array(arrtype, arrsize); + std::vector val = get_array("random", arrsize); + std::vector key_bkp = key; + // benchmark + for (auto _ : state) { + x86simdsort::keyvalue_qsort(key.data(), val.data(), arrsize); + state.PauseTiming(); + key = key_bkp; + state.ResumeTiming(); + } +} + +#define BENCH_BOTH_KVSORT(type) \ + BENCH_SORT(simdkvsort, type) \ + BENCH_SORT(scalarkvsort, type) + +BENCH_BOTH_KVSORT(uint64_t) +BENCH_BOTH_KVSORT(int64_t) +BENCH_BOTH_KVSORT(double) diff --git a/run-bench.py b/run-bench.py index 04946393..6a14fa18 100644 --- a/run-bench.py +++ b/run-bench.py @@ -31,6 +31,9 @@ elif "argsort" in args.benchcompare: baseline = "scalarargsort.*" + filterb contender = "simdargsort.*" + filterb + elif "keyvalue" in args.benchcompare: + baseline = "scalarkvsort.*" + filterb + contender = "simdkvsort.*" + filterb else: parser.print_help(sys.stderr) parser.error("ERROR: Unknown argument '%s'" % args.benchcompare)