// Overview / Features / Requirements / Showcase / API / FAQ / User Guide / License
Warning
Under development
Performance
is not a number!
Single
header
/module
performance
library that combines the power of:
c++2x
,linux/perf
,llvm/mca
,gnuplot/sixel
, ...
Profiling, Analyzing, Benchmarking, Testing
names description API info
hardware/software info compiler
,cpu
,memory
,sys
,proc
,bin
[core]
low-level utilities code
,compiler
,cpu
,memory
profiler
timing (rdtsc/clock/chrono) tsc
,process_time
,thread_time
,steady_time
,profiler
counting (linux/perf) instructions
,cycles
, ...,top_down
profiler
sampling (linux/perf) instructions
,cycles
, ...,mem_loads
,mem_stores
,top_down
profiler
tracing (linux/intel_pt) instructions
,cycles
,tsc
analyzer
disassembling (llvm) mca::assembly
,mca::address
,mca::encoding
,mca::size
,mca::uops
,mca::latency
,mca::rthroughput
,mca::may_load
,mca::may_store
,mca::has_side_effects
, ...,mca::debug::source
analyzer
analyzing (llvm/mca) mca::timeline
,mca::resource_pressure
,mca::bottleneck
runner
benchmarking bench::baseline
,bench::latency
,bench::throughput
[io]
logging/plotting (gnuplot/sixel) log
,json
,report
,annotate
,plot
(hist
,box
,bar
,line
,ecdf
)[] -
inline namespace
Optimal (All Features)
intel-12th+
with PEBS
,IPT
support
libipt
-apt-get install libipt-dev
- (
clang-19+
|gcc-13+
) /c++2x+
llvm-19+
-apt-get install llvm-dev
linux-6.x+
perf-event-open
-apt-get install linux-tools-common
terminal
with sixel
support
gnuplot
-apt-get install gnuplot
Usage
/** * Info/Core */ import perf; // #include <perf> int main() { auto&& spec = perf::info::spec{{ {"time", std::chrono::system_clock::now()}, {"perf.version", perf::info::version()}, {"sys.triple", perf::info::sys::triple()}, {"cxx.name", perf::info::compiler::name()}, {"cxx.version", perf::info::compiler::version()}, {"cpu.name", perf::info::cpu::name()}, {"cpu.code_name", perf::info::cpu::code_name()}, {"cpu.version", perf::info::cpu::version()}, {"mem.cache", perf::info::memory::dcache()}, // ... }}; perf::log(spec); }/** * Profiling/Analyzing */ import perf; // #include <perf> int main() { perf::profiler profiler{ perf::stat::tsc, perf::stat::cycles, perf::trace::instructions, }; perf::analyzer analyzer{ perf::mca::assembly, perf::mca::timeline, }; auto invoke = [&](auto&& fn, auto&&... ts) { profiler.start(); perf::compiler::prevent_elision(fn(ts...)); profiler.stop(); }; invoke(fn, std::rand()); perf::log(profiler[]); perf::log(analyzer[]); analyzer << profiler[perf::trace::instructions]; perf::verify(profiler[perf::stat::tsc] > 0ns); perf::verify(analyzer[perf::mca::timeline][0u].cycle_dispatched); perf::verify(analyzer[perf::mca::assembly][0u].contains("add")); }/** * Benchmarking */ import perf; // #include <perf> int main() { perf::runner bench{perf::bench::latency{}}; bench(fn, perf::data::sequence<int>{{3, 5, 15}}); bench(fn, perf::data::range<int>{.start = 0, .stop = 15}); bench(fn, perf::data::unpredictable<int>); perf::report(bench[perf::stat::tsc]); perf::plot::bar(bench[perf::stat::tsc]); perf::annotate<perf::vsplit>(bench[perf::mca::assembly]); }Build & Test
# module clang++ -std=c++23 -O3 -I. --precompile perf.cppm # -DNTEST disables compile-time tests clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -lipt # header $CXX -std=c++23 -O3 -I. <source_file> -lLLVM -lipt # -DNTEST disables compile-time tests.github/scripts/tune.sh # See #FAQ for more
Export & Share
./a.out | .github/scripts/export.sh html | gh gist create --public
scripts/export.sh
-html
,markdown
,notebook
gh
-apt-get install gh
Configuration
/** * PERF version (read-only) # https://semver.org */ #define PERF (MAJOR, MINOR, PATCH) // ex. (1, 0, 0)/** * GNU # default: deduced based on `__GNUC__` * - 0 not compatible * - 1 compatible */ #define PERF_GNU 0/1 /** * Linux # default: deduced based on `__linux__` * - 0 not supported * - 1 supported */ #define PERF_LINUX 0/1 /** * UEFI # default: 0 * - 0 not supported * - 1 supported */ #define PERF_UEFI 0/1 /** * LLVM # default: deduced based on `llvm-dev` headers * - 0 not supported * - 1 supported */ #define PERF_LLVM 0/1 /** * Intel Processor Trace # default: deduced based on `intel_pt` headers * - 0 not supported * - 1 supported */ #define PERF_INTEL 0/1 /** * I/O support # default: 1 * - 0 not compiled * - 1 supported (`log, json, report, annotate, plot`) */ #define PERF_IO 0/1 /** * tests # default: not-defined * - defined: disables all compile-time, run-time tests * - not-defined: compile-time tests executed, * run-time tests available by `perf::self::test()` API */ #define NTEST/** * gnuplot terminal # see `gnuplot -> set terminal` # default: 'sixel' * - 'sixel' # console image # https://www.arewesixelyet.com * - 'wxt' # popup window * - 'canvas' # html * - 'dumb size 150,25 ansi' # console with colors * - 'dumb size 80,25' # console */ ENV:PERF_IO_PLOT_TERM /** * style # default: dark * - light * - dark */ ENV:PERF_IO_PLOT_STYLEInfo/Core
namespace perf::info { /** * static_assert(version().major == 1); * static_assert(version().minor == 0); * static_assert(version().patch == 0); */ inline constexpr auto version = [] -> sem_ver; } // namespace perf::infonamespace perf::info::compiler { /** * verify(name() == "clang"s); */ inline constexpr auto name = [] -> std::string_view; /** * static_assert(version().major == 20); * static_assert(version().minor == 0); * static_assert(version().patch == 0); */ inline constexpr auto version = [] -> sem_ver; } // namespace perf::info::compiler// perf::info::cpu::name assert(perf::info::cpu::name() == "12th Gen Intel(R) Core(TM) i7-12650"s); // perf::info::cpu::code_name assert(perf::info::cpu::code_name() == "alderlake"s); // perf::info::cpu::version assert(perf::info::cpu::version().family == 6); assert(perf::info::cpu::version().model == 154); assert(perf::info::cpu::version().stepping == 3); // perf::info::cpu::dispatch_width assert(perf::info::cpu::dispatch_width() == 6); // perf::info::cpu::features assert(perf::info::cpu::features() == std::vector{"avx", "avx2", "bmi", ...});// info::memory::icache assert(perf::info::memory::icache() == std::map{{level::L1, {.size = 448KiB, .line_size = 64, .assoc = 8}}} ); // info::memory::dcache assert(perf::info::memory::dcache() == std::map{ {level::L1, {.size = 416KiB, .line_size = 64, .assoc = 8}}, ...}, {level::L2, {.size = 9.5MiB, .line_size = 64, .assoc = 12}}, ...}, {level::L3, {.size = 24MiB, .line_size = 64, .assoc = 12}}, ...}, });// info::sys::name assert(perf::info::sys::name() == "linux"s); // info::sys::triple assert(perf::info::sys::triple() == "x86_64-pc-linux-gnu"s); // info::sys::page_size assert(perf::info::sys::page_size() == 4096b);// info::proc::name assert(perf::info::proc::self::name() == "/full/path/example.out"s); // info::proc::base_address assert(perf::info::proc::self::base_address());// info::bin::addr_to_fn_name static auto fn = [] {}; auto&& fn_name = perf::info::bin::addr_to_fn_name( 
perf::info::proc::self::name(), std::uint64_t(&fn) - perf::info::proc::self::base_address() ); assert(fn_name.has_value() and *fn_name == "fn"s); // info::bin::addr_to_name static auto var = 0; auto&& var_name = perf::info::bin::addr_to_name( perf::info::proc::self::name(), std::uint64_t(&var) - perf::info::proc::self::base_address() ); assert(var_name.has_value() and *var_name == "var"s); // info::bin::addr_to_line # requires debug symbols (-g) label:; auto&& source = perf::info::bin::addr_to_line( perf::info::proc::self::name(), std::uint64_t(&&label) - perf::info::proc::self::base_address() ); assert(source.has_value() and source->contains("label:;"));// code::align perf::code::align<std::align_val_t(64u)>(); for (...) { } // code::label perf::code::label<"begin">(); // begin: perf::code::label<"end">(); // end: assert(perf::code::labels["begin"] != perf::code::labels["end"]);// compiler::prevent_reorder # std::atomic_signal_fence perf::compiler::prevent_reorder(); // compiler::prevent_elision int i{}; assert(perf::compiler::prevent_elision(i++)); // compiler::is_elided assert(perf::compiler::is_elided([] { })); assert(perf::compiler::is_elided([] { int i{}; i++; })); assert(not perf::compiler::is_elided([] { int i{}; perf::compiler::prevent_elision(i++); }));// cpu::pipeline::flush perf::cpu::pipeline::flush();// memory::align/memory::is_aligned auto aligned_addr = perf::memory::align<perf::memory::direction::up>( addr, std::align_val_t(64u) ); assert(perf::memory::is_aligned(aligned_addr, std::align_val_t(64u))); // memory::synchronize # std::atomic_thread_fence perf::memory::synchronize(); // memory::prefetch perf::memory::prefetch<perf::memory::operation::write, perf::memory::locality::high>(addr); // memory::lock # scoped{mlockall, munlockall} { perf::memory::lock _; } // memory::protect const std::array add{ // x86-64 0x89, 0xf8, // mov eax, edi 0x01, 0xf0, // add eax, esi 0xc3 // ret }; perf::memory::protect( std::span(add), perf::memory::protection::read 
| perf::memory::protection::write | perf::memory::protection::exec ); assert(invoke(add, 1, 2) == 3); assert(invoke(add, 2, 3) == 5); // memory::pollute # pollutes memory by making allocations perf::memory::pollute(1024u); // memory::pre_fault # touches all pages used by data perf::memory::pre_fault(std::span(...)); // memory::flush # clears all cache lines used by data perf::memory::cache::flush(std::span(...));// sys::affinity perf::thread::affinity::set(perf::thread::self, 2u); assert(2u == perf::thread::affinity::get(perf::thread::self));Profiling/Analyzing
Benchmarking/Testing
Setup Guide
How to setup
perf
docker?docker build -t perf .
docker run \ -it \ --privileged \ --network=host \ -e DISPLAY=${DISPLAY} \ -v ${PWD}:${PWD} \ -w ${PWD} \ perfHow to install
perf
dependencies?apt-get install linux-tools-common # linux-perf (perf::stat/perf::record) apt-get install llvm-dev # llvm (perf::mc/perf::mca) apt-get install libipt-dev # libipt (perf::trace) apt-get install gnuplot # (perf::plot)How to setup
linux performance counters
?.github/scripts/setup.sh --perf # --rdpmc --max-sample-rate 10000
sudo mount -o remount,mode=755 /sys/kernel/debug sudo mount -o remount,mode=755 /sys/kernel/debug/tracing sudo chown `whoami` /sys/kernel/debug/tracing/uprobe_events sudo chmod a+rw /sys/kernel/debug/tracing/uprobe_events echo 0 | sudo tee /proc/sys/kernel/kptr_restrict echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid echo 1000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate echo 2 | sudo tee /sys/devices/cpu_core/rdpmc How to reduce
execution variability
?.github/scripts/tune.sh
pyperf
-pip3 install pyperf
sudo pyperf system tune sudo pyperf system show sudo pyperf system reset# Set Process CPU Affinity (apt install util-linux) taskset -c 0 ./a.out # Set Process Scheduling Priority (apt install coreutils) nice -n -20 taskset -c 0 ./a.out # -20..19 (most..least favorable to the process) # Disable CPU Frequency Scaling (apt install cpufrequtils) sudo cpupower frequency-set --governor performance # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor # Disable Address Space Randomization echo 0 > /proc/sys/kernel/randomize_va_space # Disable Processor Boosting echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost # Disable Turbo Mode echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo # Disable Hyperthreading/SMT echo off | sudo tee /sys/devices/system/cpu/smt/control # Restrict memory to a single socket numactl -m 0 -N 0 ./a.out # Enable Huge Pages sudo numactl --cpunodebind=1 --membind=1 hugeadm \ --obey-mempolicy --pool-pages-min=1G:64 sudo hugeadm --create-mounts# Enable Kernel Mode Task-Isolation (https://lwn.net/Articles/816298) # cat /sys/devices/system/cpu/isolated isolcpus=<cpu number>,...,<cpu number> # Disable P-states and C-states # cat /sys/devices/system/cpu/intel_pstate/status idle=poll intel_pstate=disable intel_idle.max_cstate=0 processor.max_cstate=1 # Disable NMI watchdog # cat /proc/sys/kernel/nmi_watchdog nmi_watchdog=0 clang++ -std=c++20 -I. \ -target x86_64-pc-win32-coff \ -fno-stack-protector \ -fshort-wchar \ -mno-red-zone \ -c uefi.cpp -o uefi.o lld-link \ -filealign:16 \ -subsystem:efi_application \ -nodefaultlib -dll \ -entry:efi_main \ -out:BOOTX64.EFI \ uefi.o mkdir -p efi/boot && cp BOOTX64.EFI /usr/share/ovmf/OVMF.fd efi/boot qemu-system-x86_64 \ -drive if=pflash,format=raw,file=efi/boot/OVMF.fd \ -drive format=raw,file=fat:rw:. \ -net noneUsage Guide
How to compile
perf
with modules
?clang++ -std=c++23 -O3 -I. --precompile perf.cppm clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -liptimport perf;
How to change
assembly
syntax?perf::llvm llvm{ {.syntax = perf::arch::syntax::att} // default: intel };How to
analyze
for a different architecture?perf::llvm llvm{ .triple = "x86_64-pc-linux-gnu" // see `llvm-llc` for details };Which
terminal
can display images?Any terminal with sixel support - https://www.arewesixelyet.com
(Visual Studio Code
images support in terminal -Terminal -> Enable images option
)How to change plotting
terminal
?PERF_IO_PLOT_TERM='sixel' # terminal - sixel PERF_IO_PLOT_TERM='dumb size 80,25' # terminal ascii PERF_IO_PLOT_TERM='dumb size 150,25 ansi' # terminal ansi PERF_IO_PLOT_TERM='wxt' # popup windows PERF_IO_PLOT_TERM='canvas' # html PERF_IO_PLOT_TERM='png' # pnggnuplot: set terminal # available optionsHow to change plot style?
PERF_IO_PLOT_STYLE='dark' # dark - default PERF_IO_PLOT_STYLE='light' # lightHow to save plot?
perf::plot::gnuplot plt{{.term = "png"}}; plt.send("set output 'output.png'"); perf::plot::bar(plt, ...);How to
export
results?./a.out 2>&1 | .github/scripts/export.sh markdown > results.md ./a.out 2>&1 | .github/scripts/export.sh notebook > results.ipynb ./a.out 2>&1 | .github/scripts/export.sh html > results.htmlHow to
share
results?
gh
-apt-get install gh
# https://jbt.github.io/markdown-editor gh gist create --public --web results.md
# https://jupyter.org gh gist create --public --web results.ipynb
# https://htmlpreview.github.io gh gist create --public --web results.html
How to write custom
profiler
?struct my_profiler { constexpr auto start(); constexpr auto stop(); [[nodiscard]] constexpr auto operator[](Ts...) const; };static_assert(perf::profiler_like<my_profiler>);
perf::runner bench{ [](auto&& fn, auto&&... ts) { my_profiler profiler{}; profiler.start(); perf::compiler::prevent_elision(fn(ts...)); profiler.stop(); } };How to integrate with
profiling
tools?
linux-perf
-apt-get install linux-tools-common
intel-vtune
-apt-get install intel-oneapi-vtune
amd-uprof
-https://www.amd.com/en/developer/uprof.html#downloads
gperftools
-apt-get install google-perftools
llvm-xray
-apt-get install llvm
callgrind
-apt-get install valgrind
#include <fcntl.h> #include <unistd.h> /* * https://perf.wiki.kernel.org * https://perfwiki.github.io * * int main() { * linux_perf profiler{"/dev/shm/perf"}; * profiler.start(); * // ... * profiler.stop(); * } * * $CXX -std=c++20 -O3 -g perf.cpp * perf record --control=fifo:/dev/shm/perf --delay=-1 ./a.out */ class linux_perf { static constexpr auto enable = "enable\n"; static constexpr auto disable = "disable\n"; public: constexpr explicit linux_perf(std::string&& control) : fd_{open(control.c_str(), O_WRONLY)} { } constexpr linux_perf(const linux_perf&) = delete; constexpr linux_perf(linux_perf&& other) : fd_{std::move(other.fd_)} { other.fd_ = -1; } constexpr ~linux_perf() noexcept { if (fd_ == -1) return; close(fd_); } constexpr auto start() { return write(fd_, enable, sizeof(enable)); } constexpr auto stop() { return write(fd_, disable, sizeof(disable)); } private: int fd_{}; };#include <ittnotify.h> /** * https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html * https://github.com/intel/ittapi * * int main() { * intel_vtune profiler{"domain", "task"}; * profiler.start(); * // ... 
* profiler.stop(); * } * * $CXX -std=c++20 -O3 -g vtune.cpp -littnotify * vtune -collect performance-snapshot -start-paused -finalization-mode=full -r result -- ./a.out */ class intel_vtune { public: constexpr explicit intel_vtune(std::string&& domain, std::string&& task) : domain_{__itt_domain_create(domain.c_str())}, task_name_{__itt_string_handle_create(task.c_str())} { __itt_task_begin(domain_, __itt_null, __itt_null, task_name_); assert(__builtin_cpu_is("intel")); } constexpr intel_vtune(intel_vtune&&) = default; constexpr intel_vtune(const intel_vtune&) = delete; constexpr ~intel_vtune() noexcept { __itt_task_end(domain_); } constexpr auto start() { return __itt_resume(); } constexpr auto stop() { return __itt_pause(); } private: __itt_domain* domain_{}; __itt_string_handle* task_name_{}; };#include <AMDProfileController.h> /** * https://www.amd.com/en/developer/uprof.html * * int main() { * amd_uprof profiler{}; * profiler.start(); * // ... * profiler.stop(); * } * $CXX -std=c++20 -O3 -g uprof.cpp -lAMDProfileController * AMDuProfCLI collect --config tbp --start-paused ./a.out */ class amd_uprof { public: constexpr amd_uprof() { assert(__builtin_cpu_is("amd")); } constexpr amd_uprof(amd_uprof&&) = default; constexpr amd_uprof(const amd_uprof&) = delete; constexpr auto start() { return amdProfileResume(); } constexpr auto stop() { return amdProfilePause(); } };#include <gperftools/profiler.h> /** * int main() { * gperf profiler{"gperf"}; * profiler.start(); * // ... 
* profiler.stop(); * profiler.flush(); // optional * } * * $CXX -g -O3 perf.cpp -lprofiler * CPUPROFILE_FREQUENCY=1000 ./a.out * google-pprof a.out profile.prof */ class gperf { public: constexpr explicit gperf(std::string&& fname) : fname_{std::move(fname)} { } constexpr gperf(gperf&&) = default; constexpr gperf(const gperf&) = delete; constexpr ~gperf() noexcept { flush(); } constexpr auto start() { return ProfilerStart(fname_.c_str()); } constexpr auto stop() { return ProfilerStop(); } constexpr void flush() { return ProfilerFlush(); } private: std::string fname_{}; };#include <xray/xray_interface.h> #include <xray/xray_log_interface.h> /** * https://llvm.org/docs/XRay.html * https://godbolt.org/z/WhsEYf9cc * * int main() { * llvm_xray profiler{"xray-fdr"}; * profiler.start(); * // ... * profiler.stop(); * profiler.flush(); // optional * } * * [[clang::xray_always_instrument]] void always_profile(); * [[clang::xray_always_instrument, clang::xray_log_args(1)]] void always_profile_and_log_i(int i); * [[clang::xray_never_instrument]] void never_profile(); * * void handler([[maybe_unused]] int32_t func_id, XRayEntryType entry) { * if (entry == XRayEntryType::ENTRY) { * // profiler.start(); * } else { * // profiler.stop(); * } * } * * int main() { * __xray_set_handler(handler); * __xray_patch(); * } * * clang++ -std=c++20 -O3 -g xray.cpp -fxray-instrument -fxray-instruction-threshold=1 * ./a.out * llvm-xray account xray-log.* --top=10 --sort=sum --sortorder=dsc --instr_map=./a.out * llvm-xray extract ./a.out --symbolize */ class llvm_xray { public: constexpr explicit llvm_xray( std::string&& mode = "xray-fdr", std::string&& cfg = "xray_logfile_base=xray-log.%" ) { __xray_log_select_mode(mode.c_str()); __xray_log_init_mode(mode.c_str(), cfg.c_str()); } constexpr llvm_xray(llvm_xray&&) = default; constexpr llvm_xray(const llvm_xray&) = delete; constexpr ~llvm_xray() noexcept { flush(); } [[clang::xray_never_instrument]] constexpr auto start() { return __xray_patch(); } 
[[clang::xray_never_instrument]] constexpr auto stop() { return __xray_unpatch(); } [[clang::xray_never_instrument]] constexpr void flush() { __xray_log_finalize(); __xray_log_flushLog(); } };#include <valgrind/callgrind.h> /** * int main() { * callgrind profiler{"profile"}; * * while (true) { * profiler.start(); // resets profile * * if (should_trigger()) { * trigger(); * profiler.stop(); * profiler.flush(); // dumps `example` profile * } * } * } * * $CXX -std=c++20 -O3 -g callgrind.cpp * valgrind --tool=callgrind --instr-atstart=no \ # 5x-100x overhead * --cache-sim=yes --branch-sim=yes --collect-jumps=yes --dump-instr=yes a.out * kcachegrind callgrind.* */ class callgrind { public: constexpr explicit callgrind(std::string&& profile) : profile_{std::move(profile)} { } constexpr callgrind(callgrind&&) = default; constexpr callgrind(const callgrind&) = delete; constexpr ~callgrind() noexcept { flush(); } constexpr auto start() { CALLGRIND_START_INSTRUMENTATION; } constexpr auto stop() { CALLGRIND_STOP_INSTRUMENTATION; } constexpr void flush() { CALLGRIND_DUMP_STATS_AT(profile_.c_str()); } private: std::string profile_{}; };How to integrate with
unit-testing
frameworks?import perf; import ut; // https://github.com/qlibs/ut int main() { perf::runner bench{perf::bench::latency{}}; perf::scoped _ { .on_exit = [&] { perf::report(bench[perf::stat::cpu_time]); } }; "benchmark1"_test = [] { bench(fn1, ts1...); }; "benchmark2"_test = [] { bench(fn2, ts2...); }; }How
perf
tests are working?
compile-time
tests are executed uponinclude/import
(enabled by default)
run-time/sanity check
tests can be executed at run-timeint main() { perf::self::test({.verbose = true}); // run-time/sanity check tests }
-DNTEST
can be used to disable tests (not recommended)$CXX -DNTEST ... # tests will NOT be compiled
perf
tests execution model#ifndef NTEST "perf"_suite = [] { "run-time and compile-time"_test = [] constexpr { expect(3 == accumulate({1, 2, 3}, 0)); }; "run-time"_test = [] mutable { expect(std::rand() >= 0); }; "compile-time"_test = [] consteval { expect(sizeof(int) == sizeof(0)); }; }; #endifHow to integrate with
jupyter
?
jupyter
(apt install jupyter
) can be used for data analysis (python
)int main() { // ... perf perf::json("perf.json"); }# notebook.ipynb import pandas as pd df = pd.read_json("perf.json") print(df.head())jupyter notebook notebook.ipynbWhat is the difference between
latency
and throughput
?
latency
is the time it takes for a single operation to complete (ns)
throughput
is the total number of operations or tasks completed in a given amount of time (op/s)What is
top-down microarchitecture analysis
method?https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html
https://github.com/andikleen/pmu-tools/wiki/toplev-manualWhat are
performance
compilation flags?-O1 # optimizations (O1) [0] -O2 # optimizations (O1 + O2) [0] -O3 # optimizations (O1 + O2 + O3) [0] -march=native # architecture specific [1] -DNDEBUG # disables asserts, etc.-fno-omit-frame-pointer # keeps the frame pointer in a register
-ffast-math # [unsafe] faster but non-conforming math [2] -fcf-protection=none # [unsafe] stops emitting `endbr64`[0] https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
[1] https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
[2] https://gcc.gnu.org/wiki/FloatingPointMathWhat are
performance
compiler attributes?
gnu::target
[[gnu::target("avx2")]] [[gnu::target("bmi")]]
gnu::optimize
[[gnu::optimize("O3")]] [[gnu::optimize("ffast-math")]] https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
MIT/Apache2+LLVM
license namespace guard description MIT perf::*
- https://opensource.org/license/mit Apache2+LLVM perf::mca::*
PERF_LLVM == 1
https://llvm.org/LICENSE.txt