From 781700f079896b920f373e3fae2ed3a82359f85a Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty@gmail.com>
Date: Thu, 11 Jul 2024 14:06:38 +0200
Subject: [PATCH] [libc][math] Optimize generic nearest integer functions

---
 .../FPUtil/NearestIntegerOperations.h         |  37 ++--
 .../math/performance_testing/CMakeLists.txt   |  19 ++
 .../nearest_integer_funcs_perf.cpp            | 168 ++++++++++++++++++
 3 files changed, 208 insertions(+), 16 deletions(-)
 create mode 100644 libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp
diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h
index cff32938229d0..a9a0a97eebb5c 100644
--- a/libc/src/__support/FPUtil/NearestIntegerOperations.h
+++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h
@@ -75,15 +75,17 @@ LIBC_INLINE T ceil(T x) {
   }
 
   uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
-  StorageType trunc_mantissa =
-      static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
-  bits.set_mantissa(trunc_mantissa);
-  T trunc_value = bits.get_val();
+  StorageType x_u = bits.uintval();
+  StorageType trunc_u =
+      static_cast<StorageType>((x_u >> trim_size) << trim_size);
 
   // If x is already an integer, return it.
-  if (trunc_value == x)
+  if (trunc_u == x_u)
     return x;
 
+  bits.set_uintval(trunc_u);
+  T trunc_value = bits.get_val();
+
   // If x is negative, the ceil operation is equivalent to the trunc operation.
   if (is_neg)
     return trunc_value;
@@ -130,15 +132,17 @@ LIBC_INLINE T round(T x) {
   uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
   bool half_bit_set =
       bool(bits.get_mantissa() & (StorageType(1) << (trim_size - 1)));
-  StorageType trunc_mantissa =
-      static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
-  bits.set_mantissa(trunc_mantissa);
-  T trunc_value = bits.get_val();
+  StorageType x_u = bits.uintval();
+  StorageType trunc_u =
+      static_cast<StorageType>((x_u >> trim_size) << trim_size);
 
   // If x is already an integer, return it.
-  if (trunc_value == x)
+  if (trunc_u == x_u)
     return x;
 
+  bits.set_uintval(trunc_u);
+  T trunc_value = bits.get_val();
+
   if (!half_bit_set) {
     // Franctional part is less than 0.5 so round value is the
     // same as the trunc value.
@@ -188,16 +192,17 @@ round_using_specific_rounding_mode(T x, int rnd) {
   }
 
   uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
-  FPBits<T> new_bits = bits;
-  StorageType trunc_mantissa =
-      static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
-  new_bits.set_mantissa(trunc_mantissa);
-  T trunc_value = new_bits.get_val();
+  StorageType x_u = bits.uintval();
+  StorageType trunc_u =
+      static_cast<StorageType>((x_u >> trim_size) << trim_size);
 
   // If x is already an integer, return it.
-  if (trunc_value == x)
+  if (trunc_u == x_u)
     return x;
 
+  FPBits<T> new_bits(trunc_u);
+  T trunc_value = new_bits.get_val();
+
   StorageType trim_value =
       bits.get_mantissa() &
       static_cast<StorageType>(((StorageType(1) << trim_size) - 1));
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index 4ea78f9999e4d..bf88fbb85c5d7 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -366,3 +366,22 @@ add_perf_binary(
   COMPILE_OPTIONS
     -fno-builtin
 )
+
+add_perf_binary(
+  nearest_integer_funcs_perf
+  SRCS
+    nearest_integer_funcs_perf.cpp
+  DEPENDS
+    libc.src.math.ceilf
+    libc.src.math.ceilf16
+    libc.src.math.floorf
+    libc.src.math.floorf16
+    libc.src.math.roundevenf
+    libc.src.math.roundevenf16
+    libc.src.math.roundf
+    libc.src.math.roundf16
+    libc.src.math.truncf
+    libc.src.math.truncf16
+  COMPILE_OPTIONS
+    -fno-builtin
+)
diff --git a/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp
new file mode 100644
index 0000000000000..24176a377e9d4
--- /dev/null
+++ b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp
@@ -0,0 +1,168 @@
+//===-- Performance test for nearest integer functions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/ceilf.h"
+#include "src/math/ceilf16.h"
+#include "src/math/floorf.h"
+#include "src/math/floorf16.h"
+#include "src/math/roundevenf.h"
+#include "src/math/roundevenf16.h"
+#include "src/math/roundf.h"
+#include "src/math/roundf16.h"
+#include "src/math/truncf.h"
+#include "src/math/truncf16.h"
+#include "test/src/math/performance_testing/Timer.h"
+
+#include <fstream>
+#include <math.h>
+
+namespace LIBC_NAMESPACE::testing {
+
+template <typename T> class NearestIntegerPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+
+public:
+  typedef T Func(T);
+
+  static void run_perf_in_range(Func my_func, Func other_func,
+                                StorageType starting_bit,
+                                StorageType ending_bit, StorageType step,
+                                size_t rounds, std::ofstream &log) {
+    auto runner = [=](Func func) {
+      volatile T result;
+      for (size_t i = 0; i < rounds; i++) {
+        for (StorageType bits = starting_bit; bits <= ending_bit;
+             bits += step) {
+          T x = FPBits(bits).get_val();
+          result = func(x);
+        }
+      }
+    };
+
+    Timer timer;
+    timer.start();
+    runner(my_func);
+    timer.stop();
+
+    size_t number_of_runs = (ending_bit - starting_bit) / step + 1;
+    double my_average =
+        static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
+    log << "-- My function --\n";
+    log << "     Total time      : " << timer.nanoseconds() << " ns \n";
+    log << "     Average runtime : " << my_average << " ns/op \n";
+    log << "     Ops per second  : "
+        << static_cast<uint64_t>(1'000'000'000.0 / my_average) << " op/s \n";
+
+    timer.start();
+    runner(other_func);
+    timer.stop();
+
+    double other_average =
+        static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
+    log << "-- Other function --\n";
+    log << "     Total time      : " << timer.nanoseconds() << " ns \n";
+    log << "     Average runtime : " << other_average << " ns/op \n";
+    log << "     Ops per second  : "
+        << static_cast<uint64_t>(1'000'000'000.0 / other_average) << " op/s \n";
+
+    log << "-- Average runtime ratio --\n";
+    log << "     Mine / Other's  : " << my_average / other_average << " \n";
+  }
+
+  static void run_perf(Func my_func, Func other_func, size_t rounds,
+                       const char *log_file) {
+    std::ofstream log(log_file);
+    log << "Performance tests with inputs in normal integral range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/StorageType((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN),
+        /*ending_bit=*/
+        StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN - 1)
+                    << FPBits::SIG_LEN),
+        /*step=*/StorageType(1 << FPBits::SIG_LEN),
+        rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
+    log << "\n Performance tests with inputs in low integral range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/StorageType(1 << FPBits::SIG_LEN),
+        /*ending_bit=*/StorageType((FPBits::EXP_BIAS - 1) << FPBits::SIG_LEN),
+        /*step_bit=*/StorageType(1 << FPBits::SIG_LEN),
+        rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
+    log << "\n Performance tests with inputs in high integral range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/
+        StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN)
+                    << FPBits::SIG_LEN),
+        /*ending_bit=*/
+        StorageType(FPBits::MAX_BIASED_EXPONENT << FPBits::SIG_LEN),
+        /*step=*/StorageType(1 << FPBits::SIG_LEN),
+        rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2, log);
+    log << "\n Performance tests with inputs in normal fractional range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/
+        StorageType(((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN) + 1),
+        /*ending_bit=*/
+        StorageType(((FPBits::EXP_BIAS + 2) << FPBits::SIG_LEN) - 1),
+        /*step=*/StorageType(1), rounds * 2, log);
+    log << "\n Performance tests with inputs in subnormal fractional range:\n";
+    run_perf_in_range(my_func, other_func, /*starting_bit=*/StorageType(1),
+                      /*ending_bit=*/StorageType(FPBits::SIG_MASK),
+                      /*step=*/StorageType(1), rounds, log);
+  }
+};
+
+} // namespace LIBC_NAMESPACE::testing
+
+#define NEAREST_INTEGER_PERF(T, my_func, other_func, rounds, filename)         \
+  {                                                                            \
+    LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::run_perf(                  \
+        &my_func, &other_func, rounds, filename);                              \
+    LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::run_perf(                  \
+        &my_func, &other_func, rounds, filename);                              \
+  }
+
+static constexpr size_t FLOAT16_ROUNDS = 20'000;
+static constexpr size_t FLOAT_ROUNDS = 40;
+
+// LLVM libc might be the only libc implementation with support for float16 math
+// functions currently. We can't compare our float16 functions against the
+// system libc, so we compare them against this placeholder function.
+float16 placeholderf16(float16 x) { return x; }
+
+// The system libc might not provide the roundeven* C23 math functions either.
+float placeholderf(float x) { return x; }
+
+int main() {
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::ceilf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "ceilf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::floorf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "floorf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundevenf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "roundevenf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "roundf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::truncf16, ::placeholderf16,
+                       FLOAT16_ROUNDS, "truncf16_perf.log")
+
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::ceilf, ::ceilf, FLOAT_ROUNDS,
+                       "ceilf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::floorf, ::floorf, FLOAT_ROUNDS,
+                       "floorf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundevenf, ::placeholderf,
+                       FLOAT_ROUNDS, "roundevenf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundf, ::roundf, FLOAT_ROUNDS,
+                       "roundf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::truncf, ::truncf, FLOAT_ROUNDS,
+                       "truncf_perf.log")
+
+  return 0;
+}