From 781700f079896b920f373e3fae2ed3a82359f85a Mon Sep 17 00:00:00 2001 From: OverMighty Date: Thu, 11 Jul 2024 14:06:38 +0200 Subject: [PATCH] [libc][math] Optimize generic nearest integer functions --- .../FPUtil/NearestIntegerOperations.h | 37 ++-- .../math/performance_testing/CMakeLists.txt | 19 ++ .../nearest_integer_funcs_perf.cpp | 168 ++++++++++++++++++ 3 files changed, 208 insertions(+), 16 deletions(-) create mode 100644 libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h index cff32938229d0..a9a0a97eebb5c 100644 --- a/libc/src/__support/FPUtil/NearestIntegerOperations.h +++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h @@ -75,15 +75,17 @@ LIBC_INLINE T ceil(T x) { } uint32_t trim_size = FPBits::FRACTION_LEN - exponent; - StorageType trunc_mantissa = - static_cast((bits.get_mantissa() >> trim_size) << trim_size); - bits.set_mantissa(trunc_mantissa); - T trunc_value = bits.get_val(); + StorageType x_u = bits.uintval(); + StorageType trunc_u = + static_cast((x_u >> trim_size) << trim_size); // If x is already an integer, return it. - if (trunc_value == x) + if (trunc_u == x_u) return x; + bits.set_uintval(trunc_u); + T trunc_value = bits.get_val(); + // If x is negative, the ceil operation is equivalent to the trunc operation. if (is_neg) return trunc_value; @@ -130,15 +132,17 @@ LIBC_INLINE T round(T x) { uint32_t trim_size = FPBits::FRACTION_LEN - exponent; bool half_bit_set = bool(bits.get_mantissa() & (StorageType(1) << (trim_size - 1))); - StorageType trunc_mantissa = - static_cast((bits.get_mantissa() >> trim_size) << trim_size); - bits.set_mantissa(trunc_mantissa); - T trunc_value = bits.get_val(); + StorageType x_u = bits.uintval(); + StorageType trunc_u = + static_cast((x_u >> trim_size) << trim_size); // If x is already an integer, return it. 
- if (trunc_value == x) + if (trunc_u == x_u) return x; + bits.set_uintval(trunc_u); + T trunc_value = bits.get_val(); + if (!half_bit_set) { // Franctional part is less than 0.5 so round value is the // same as the trunc value. @@ -188,16 +192,17 @@ round_using_specific_rounding_mode(T x, int rnd) { } uint32_t trim_size = FPBits::FRACTION_LEN - exponent; - FPBits new_bits = bits; - StorageType trunc_mantissa = - static_cast((bits.get_mantissa() >> trim_size) << trim_size); - new_bits.set_mantissa(trunc_mantissa); - T trunc_value = new_bits.get_val(); + StorageType x_u = bits.uintval(); + StorageType trunc_u = + static_cast((x_u >> trim_size) << trim_size); // If x is already an integer, return it. - if (trunc_value == x) + if (trunc_u == x_u) return x; + FPBits new_bits(trunc_u); + T trunc_value = new_bits.get_val(); + StorageType trim_value = bits.get_mantissa() & static_cast(((StorageType(1) << trim_size) - 1)); diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt index 4ea78f9999e4d..bf88fbb85c5d7 100644 --- a/libc/test/src/math/performance_testing/CMakeLists.txt +++ b/libc/test/src/math/performance_testing/CMakeLists.txt @@ -366,3 +366,22 @@ add_perf_binary( COMPILE_OPTIONS -fno-builtin ) + +add_perf_binary( + nearest_integer_funcs_perf + SRCS + nearest_integer_funcs_perf.cpp + DEPENDS + libc.src.math.ceilf + libc.src.math.ceilf16 + libc.src.math.floorf + libc.src.math.floorf16 + libc.src.math.roundevenf + libc.src.math.roundevenf16 + libc.src.math.roundf + libc.src.math.roundf16 + libc.src.math.truncf + libc.src.math.truncf16 + COMPILE_OPTIONS + -fno-builtin +) diff --git a/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp new file mode 100644 index 0000000000000..24176a377e9d4 --- /dev/null +++ b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp @@ -0,0 +1,168 @@ +//===-- 
Performance test for nearest integer functions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/ceilf.h"
+#include "src/math/ceilf16.h"
+#include "src/math/floorf.h"
+#include "src/math/floorf16.h"
+#include "src/math/roundevenf.h"
+#include "src/math/roundevenf16.h"
+#include "src/math/roundf.h"
+#include "src/math/roundf16.h"
+#include "src/math/truncf.h"
+#include "src/math/truncf16.h"
+#include "test/src/math/performance_testing/Timer.h"
+
+#include <fstream>
+#include <math.h>
+
+namespace LIBC_NAMESPACE::testing {
+
+// Benchmark harness for nearest integer functions of floating-point type T.
+// Inputs are built from raw bit patterns through FPBits, and the function
+// under test is timed against a reference function with Timer.
+template <typename T> class NearestIntegerPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+
+public:
+  // Signature shared by the function under test and the reference function.
+  typedef T Func(T);
+
+  // Times my_func and then other_func over the same inputs and writes the
+  // results to log.
+  //
+  // The inputs are the values whose bit patterns are starting_bit,
+  // starting_bit + step, ... up to and including ending_bit. That sweep is
+  // repeated rounds times. For each function the total time, the average time
+  // per call and the number of calls per second are written, followed by the
+  // ratio of the two averages.
+  static void run_perf_in_range(Func my_func, Func other_func,
+                                StorageType starting_bit,
+                                StorageType ending_bit, StorageType step,
+                                size_t rounds, std::ofstream &log) {
+    auto runner = [=](Func func) {
+      // Every result is stored to a volatile object so that the compiler has
+      // to keep the store, and with it the call that produces the value.
+      volatile T result;
+      for (size_t i = 0; i < rounds; i++) {
+        for (StorageType bits = starting_bit; bits <= ending_bit;
+             bits += step) {
+          T x = FPBits(bits).get_val();
+          result = func(x);
+        }
+      }
+    };
+
+    Timer timer;
+    timer.start();
+    runner(my_func);
+    timer.stop();
+
+    // Number of inputs visited by one sweep of [starting_bit, ending_bit].
+    size_t number_of_runs = (ending_bit - starting_bit) / step + 1;
+    double my_average =
+        static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
+    log << "-- My function --\n";
+    log << " Total time : " << timer.nanoseconds() << " ns \n";
+    log << " Average runtime : " << my_average << " ns/op \n";
+    log << " Ops per second : "
+        << static_cast<uint64_t>(1'000'000'000.0 / my_average) << " op/s \n";
+
+    timer.start();
+    runner(other_func);
+    timer.stop();
+
+    double other_average =
+        static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
+    log << "-- Other function --\n";
+    log << " Total time : " << timer.nanoseconds() << " ns \n";
+    log << " Average runtime : " << other_average << " ns/op \n";
+    log << " Ops per second : "
+        << static_cast<uint64_t>(1'000'000'000.0 / other_average) << " op/s \n";
+
+    log << "-- Average runtime ratio --\n";
+    log << " Mine / Other's : " << my_average / other_average << " \n";
+  }
+
+  // Benchmarks my_func against other_func over five input ranges and writes
+  // the report to log_file. The file is opened with the default std::ofstream
+  // mode, so an existing file with that name is overwritten.
+  //
+  // The three "integral" ranges use a step of 1 << SIG_LEN and a round count
+  // scaled by EXP_BIAS * EXP_BIAS * 2; the two "fractional" ranges use a step
+  // of 1.
+  // NOTE(review): the scaling presumably makes up for the much smaller number
+  // of inputs in the integral ranges -- confirm.
+  static void run_perf(Func my_func, Func other_func, size_t rounds,
+                       const char *log_file) {
+    std::ofstream log(log_file);
+    log << "Performance tests with inputs in normal integral range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/StorageType((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN),
+        /*ending_bit=*/
+        StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN - 1)
+                    << FPBits::SIG_LEN),
+        /*step=*/StorageType(1 << FPBits::SIG_LEN),
+        rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
+    log << "\n Performance tests with inputs in low integral range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/StorageType(1 << FPBits::SIG_LEN),
+        /*ending_bit=*/StorageType((FPBits::EXP_BIAS - 1) << FPBits::SIG_LEN),
+        /*step=*/StorageType(1 << FPBits::SIG_LEN),
+        rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
+    log << "\n Performance tests with inputs in high integral range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/
+        StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN)
+                    << FPBits::SIG_LEN),
+        /*ending_bit=*/
+        StorageType(FPBits::MAX_BIASED_EXPONENT << FPBits::SIG_LEN),
+        /*step=*/StorageType(1 << FPBits::SIG_LEN),
+        rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
+    log << "\n Performance tests with inputs in normal fractional range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/
+        StorageType(((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN) + 1),
+        /*ending_bit=*/
+        StorageType(((FPBits::EXP_BIAS + 2) << FPBits::SIG_LEN) - 1),
+        /*step=*/StorageType(1), rounds * 2, log);
+    log << "\n Performance tests with inputs in subnormal fractional range:\n";
+    run_perf_in_range(my_func, other_func, /*starting_bit=*/StorageType(1),
+                      /*ending_bit=*/StorageType(FPBits::SIG_MASK),
+                      /*step=*/StorageType(1), rounds, log);
+  }
+};
+
+} // namespace LIBC_NAMESPACE::testing
+
+// Expands to a block that benchmarks my_func against other_func for type T and
+// writes the report to filename.
+// NOTE(review): run_perf is invoked twice with the same filename, so the second
+// report overwrites the first. This looks like a warm-up pass; confirm that it
+// is intentional, since it doubles the run time.
+#define NEAREST_INTEGER_PERF(T, my_func, other_func, rounds, filename)         \
+  {                                                                            \
+    LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::run_perf(                  \
+        &my_func, &other_func, rounds, filename);                              \
+    LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::run_perf(                  \
+        &my_func, &other_func, rounds, filename);                              \
+  }
+
+// Base round counts passed to run_perf for the float16 and float benchmarks.
+static constexpr size_t FLOAT16_ROUNDS = 20'000;
+static constexpr size_t FLOAT_ROUNDS = 40;
+
+// LLVM libc might be the only libc implementation with support for float16 math
+// functions currently. We can't compare our float16 functions against the
+// system libc, so we compare them against this placeholder function.
+float16 placeholderf16(float16 x) { return x; }
+
+// The system libc might not provide the roundeven* C23 math functions either.
+float placeholderf(float x) { return x; }
+
+// Runs every benchmark and writes one "<function>_perf.log" report per
+// function, using paths relative to the current working directory.
+//
+// The float16 functions and roundevenf are measured against the placeholder
+// functions, which return their argument unchanged. The remaining float
+// functions are measured against the global-namespace ::ceilf, ::floorf,
+// ::roundf and ::truncf.
+int main() {
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::ceilf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "ceilf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::floorf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "floorf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundevenf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "roundevenf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "roundf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::truncf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "truncf16_perf.log")
+
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::ceilf, ::ceilf, FLOAT_ROUNDS,
+                       "ceilf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::floorf, ::floorf, FLOAT_ROUNDS,
+                       "floorf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundevenf, ::placeholderf,
+                       FLOAT_ROUNDS, "roundevenf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundf, ::roundf, FLOAT_ROUNDS,
+                       "roundf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::truncf, ::truncf, FLOAT_ROUNDS,
+                       "truncf_perf.log")
+
+  return 0;
+}