11#include " LibcGpuBenchmark.h"
22#include " src/__support/CPP/algorithm.h"
33#include " src/__support/CPP/array.h"
4+ #include " src/__support/CPP/atomic.h"
45#include " src/__support/CPP/string.h"
56#include " src/__support/FPUtil/sqrt.h"
67#include " src/__support/GPU/utils.h"
@@ -12,61 +13,113 @@ namespace LIBC_NAMESPACE_DECL {
1213namespace benchmarks {
1314
1415FixedVector<Benchmark *, 64 > benchmarks;
15- cpp::array<BenchmarkResult, 1024 > results;
1616
1717void Benchmark::add_benchmark (Benchmark *benchmark) {
1818 benchmarks.push_back (benchmark);
1919}
2020
21- BenchmarkResult
22- reduce_results (const cpp::array<BenchmarkResult, 1024 > &results) {
23- BenchmarkResult result;
24- uint64_t cycles_sum = 0 ;
25- double standard_deviation_sum = 0 ;
26- uint64_t min = UINT64_MAX;
27- uint64_t max = 0 ;
28- uint32_t samples_sum = 0 ;
29- uint32_t iterations_sum = 0 ;
30- clock_t time_sum = 0 ;
31- uint64_t num_threads = gpu::get_num_threads ();
32- for (uint64_t i = 0 ; i < num_threads; i++) {
33- BenchmarkResult current_result = results[i];
34- cycles_sum += current_result.cycles ;
35- standard_deviation_sum += current_result.standard_deviation ;
36- min = cpp::min (min, current_result.min );
37- max = cpp::max (max, current_result.max );
38- samples_sum += current_result.samples ;
39- iterations_sum += current_result.total_iterations ;
40- time_sum += current_result.total_time ;
21+ struct AtomicBenchmarkSums {
22+ cpp::Atomic<uint64_t > cycles_sum = 0 ;
23+ cpp::Atomic<uint64_t > standard_deviation_sum = 0 ;
24+ cpp::Atomic<uint64_t > min = UINT64_MAX;
25+ cpp::Atomic<uint64_t > max = 0 ;
26+ cpp::Atomic<uint32_t > samples_sum = 0 ;
27+ cpp::Atomic<uint32_t > iterations_sum = 0 ;
28+ cpp::Atomic<clock_t > time_sum = 0 ;
29+ cpp::Atomic<uint64_t > active_threads = 0 ;
30+
31+ void reset () {
32+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
33+ active_threads.store (0 , cpp::MemoryOrder::RELAXED);
34+ cycles_sum.store (0 , cpp::MemoryOrder::RELAXED);
35+ standard_deviation_sum.store (0 , cpp::MemoryOrder::RELAXED);
36+ min.store (UINT64_MAX, cpp::MemoryOrder::RELAXED);
37+ max.store (0 , cpp::MemoryOrder::RELAXED);
38+ samples_sum.store (0 , cpp::MemoryOrder::RELAXED);
39+ iterations_sum.store (0 , cpp::MemoryOrder::RELAXED);
40+ time_sum.store (0 , cpp::MemoryOrder::RELAXED);
41+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
4142 }
42- result.cycles = cycles_sum / num_threads;
43- result.standard_deviation = standard_deviation_sum / num_threads;
44- result.min = min;
45- result.max = max;
46- result.samples = samples_sum / num_threads;
47- result.total_iterations = iterations_sum / num_threads;
48- result.total_time = time_sum / num_threads;
49- return result;
43+
44+ void update (const BenchmarkResult &result) {
45+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
46+ active_threads.fetch_add (1 , cpp::MemoryOrder::RELAXED);
47+
48+ cycles_sum.fetch_add (result.cycles , cpp::MemoryOrder::RELAXED);
49+ standard_deviation_sum.fetch_add (
50+ static_cast <uint64_t >(result.standard_deviation ),
51+ cpp::MemoryOrder::RELAXED);
52+
53+ // Perform a CAS loop to atomically update the min
54+ uint64_t orig_min = min.load (cpp::MemoryOrder::RELAXED);
55+ while (!min.compare_exchange_strong (
56+ orig_min, cpp::min (orig_min, result.min ), cpp::MemoryOrder::ACQUIRE,
57+ cpp::MemoryOrder::RELAXED))
58+ ;
59+
60+ // Perform a CAS loop to atomically update the max
61+ uint64_t orig_max = max.load (cpp::MemoryOrder::RELAXED);
62+ while (!max.compare_exchange_strong (
63+ orig_max, cpp::max (orig_max, result.max ), cpp::MemoryOrder::ACQUIRE,
64+ cpp::MemoryOrder::RELAXED))
65+ ;
66+
67+ samples_sum.fetch_add (result.samples , cpp::MemoryOrder::RELAXED);
68+ iterations_sum.fetch_add (result.total_iterations ,
69+ cpp::MemoryOrder::RELAXED);
70+ time_sum.fetch_add (result.total_time , cpp::MemoryOrder::RELAXED);
71+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
72+ }
73+ };
74+
75+ AtomicBenchmarkSums all_results;
76+
77+ void print_results (Benchmark *b) {
78+ constexpr auto GREEN = " \033 [32m" ;
79+ constexpr auto RESET = " \033 [0m" ;
80+
81+ BenchmarkResult result;
82+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
83+ int num_threads = all_results.active_threads .load (cpp::MemoryOrder::RELAXED);
84+ result.cycles =
85+ all_results.cycles_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
86+ result.standard_deviation =
87+ all_results.standard_deviation_sum .load (cpp::MemoryOrder::RELAXED) /
88+ num_threads;
89+ result.min = all_results.min .load (cpp::MemoryOrder::RELAXED);
90+ result.max = all_results.max .load (cpp::MemoryOrder::RELAXED);
91+ result.samples =
92+ all_results.samples_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
93+ result.total_iterations =
94+ all_results.iterations_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
95+ result.total_time =
96+ all_results.time_sum .load (cpp::MemoryOrder::RELAXED) / num_threads;
97+ cpp::atomic_thread_fence (cpp::MemoryOrder::RELEASE);
98+
99+ log << GREEN << " [ RUN ] " << RESET << b->get_name () << ' \n ' ;
100+ log << GREEN << " [ OK ] " << RESET << b->get_name () << " : "
101+ << result.cycles << " cycles, " << result.min << " min, " << result.max
102+ << " max, " << result.total_iterations << " iterations, "
103+ << result.total_time << " ns, "
104+ << static_cast <uint64_t >(result.standard_deviation )
105+ << " stddev (num threads: " << num_threads << " )\n " ;
50106}
51107
52108void Benchmark::run_benchmarks () {
53109 uint64_t id = gpu::get_thread_id ();
54110 gpu::sync_threads ();
55111
56112 for (Benchmark *b : benchmarks) {
57- results[id] = b->run ();
113+ if (id == 0 )
114+ all_results.reset ();
115+
58116 gpu::sync_threads ();
59- if (id == 0 ) {
60- BenchmarkResult all_results = reduce_results (results);
61- constexpr auto GREEN = " \033 [32m" ;
62- constexpr auto RESET = " \033 [0m" ;
63- log << GREEN << " [ RUN ] " << RESET << b->get_name () << ' \n ' ;
64- log << GREEN << " [ OK ] " << RESET << b->get_name () << " : "
65- << all_results.cycles << " cycles, " << all_results.min << " min, "
66- << all_results.max << " max, " << all_results.total_iterations
67- << " iterations, " << all_results.total_time << " ns, "
68- << static_cast <long >(all_results.standard_deviation ) << " stddev\n " ;
69- }
117+ auto current_result = b->run ();
118+ all_results.update (current_result);
119+ gpu::sync_threads ();
120+
121+ if (id == 0 )
122+ print_results (b);
70123 }
71124 gpu::sync_threads ();
72125}
0 commit comments