GitHub

// Overview / Features / Requirements / Showcase / API / FAQ / User Guide / License

`perf`: C++2x Performance library

Warning

Under development

Performance is not a number!

Overview

Single header/module performance library that combines the power of:
c++2x, linux/perf, llvm/mca, gnuplot/sixel, ...

Features

Profiling, Analyzing, Benchmarking, Testing

names description API

info hardware/software info compiler, cpu, memory, sys, proc, bin

[core] low-level utilities code, compiler, cpu, memory

profiler timing (rdtsc/clock/chrono) tsc, process_time, thread_time, steady_time,

profiler counting (linux/perf) instructions, cycles, ..., top_down

profiler sampling (linux/perf) instructions, cycles, ..., mem_loads, mem_stores, top_down

profiler tracing (linux/intel_pt) instructions, cycles, tsc

analyzer disassembling (llvm) mca::assembly, mca::address, mca::encoding, mca::size, mca::uops, mca::latency, mca::rthroughput, mca::may_load, mca::may_store, mca::has_side_effects, ..., mca::debug::source

analyzer analyzing (llvm/mca) mca::timeline, mca::resource_pressure, mca::bottleneck

runner benchmarking bench::baseline, bench::latency, bench::throughput

[io] logging/plotting (gnuplot/sixel) log, json, report, annotate, plot (hist, box, bar, line, ecdf)

[] - inline namespace

Requirements

Minimal (Core functionality)

(clang-16+ | gcc-12+) / c++2x+

Optimal (All Features)

intel-12th+ with PEBS, IPT support

libipt - apt-get install libipt-dev

(clang-19+ | gcc-13+) / c++2x+

llvm-19+ - apt-get install llvm-dev

linux-6.x+

perf-event-open - apt-get install linux-tools-common

terminal with sixel support

gnuplot - apt-get install gnuplot

Auxiliary (Enhancements)

ut - https://github.com/qlibs/ut

gh - apt-get install gh

Dockerfile

Showcase

Usage

/**
 * Info/Core
 */
import perf; // #include <perf>

int main() {
  // Key/value description of the environment; every value comes from the
  // matching `perf::info` query (or the system clock for "time").
  auto&& spec = perf::info::spec{{
    {"time",          std::chrono::system_clock::now()},
    {"perf.version",  perf::info::version()},
    {"sys.triple",    perf::info::sys::triple()},
    {"cxx.name",      perf::info::compiler::name()},
    {"cxx.version",   perf::info::compiler::version()},
    {"cpu.name",      perf::info::cpu::name()},
    {"cpu.code_name", perf::info::cpu::code_name()},
    {"cpu.version",   perf::info::cpu::version()},
    {"mem.cache",     perf::info::memory::dcache()},
    // ...
  }};

  // Hand the collected spec to the library's logging entry point.
  perf::log(spec);
}

/**
 * Profiling/Analyzing
 */
import perf; // #include <perf>

int main() {
  // Profiler constructed from two `perf::stat` entries and one `perf::trace` entry.
  perf::profiler profiler{
    perf::stat::tsc, perf::stat::cycles, perf::trace::instructions,
  };
  // Analyzer constructed from the `mca::assembly` and `mca::timeline` entries.
  perf::analyzer analyzer{
    perf::mca::assembly, perf::mca::timeline,
  };

  // Brackets a single call of `fn(ts...)` with `profiler.start()` /
  // `profiler.stop()`; the call result is passed to `prevent_elision`.
  auto invoke = [&](auto&& fn, auto&&... ts) {
    profiler.start();
    perf::compiler::prevent_elision(fn(ts...));
    profiler.stop();
  };

  // NOTE(review): `fn` is not defined in this snippet - presumably the
  // function under test; confirm.
  invoke(fn, std::rand());

  perf::log(profiler[]);
  perf::log(analyzer[]);

  // Streams the profiler's `trace::instructions` entry into the analyzer.
  // NOTE(review): this happens after `perf::log(analyzer[])` above - confirm
  // that the order is intended.
  analyzer << profiler[perf::trace::instructions];

  // Checks on the collected profiler and analyzer entries.
  perf::verify(profiler[perf::stat::tsc] > 0ns);
  perf::verify(analyzer[perf::mca::timeline][0u].cycle_dispatched);
  perf::verify(analyzer[perf::mca::assembly][0u].contains("add"));
}

/**
 * Benchmarking
 */
import perf; // #include <perf>

int main() {
  // Runner constructed from a `perf::bench::latency` object.
  perf::runner bench{perf::bench::latency{}};

  // The same `fn` is passed to the runner with three different
  // `perf::data` input descriptions.
  // NOTE(review): `fn` is not defined in this snippet - presumably the
  // function under test; confirm.
  bench(fn, perf::data::sequence<int>{{3, 5, 15}});
  bench(fn, perf::data::range<int>{.start = 0, .stop = 15});
  bench(fn, perf::data::unpredictable<int>);

  // Entries are looked up on the runner by key and handed to the
  // report / plot / annotate helpers.
  perf::report(bench[perf::stat::tsc]);
  perf::plot::bar(bench[perf::stat::tsc]);
  perf::annotate<perf::vsplit>(bench[perf::mca::assembly]);
}

API

Build & Test

# module
clang++ -std=c++23 -O3 -I. --precompile perf.cppm # -DNTEST disables compile-time tests
clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -lipt

# header
$CXX -std=c++23 -O3 -I. <source_file> -lLLVM -lipt # -DNTEST disables compile-time tests

.github/scripts/tune.sh # See #FAQ for more

scripts/tune.sh
Dockerfile

Export & Share

./a.out | .github/scripts/export.sh html | gh gist create --public

scripts/export.sh - html, markdown, notebook
gh - apt-get install gh

Studies

API

Configuration

/**
 * PERF version (read-only) # https://semver.org
 */
#define PERF (MAJOR, MINOR, PATCH) // ex. (1, 0, 0)

/**
 * GNU # default: deduced based on `__GNUC__`
 * - 0 not compatible
 * - 1 compatible
 */
#define PERF_GNU 0/1

/**
 * Linux # default: deduced based on `__linux__`
 * - 0 not supported
 * - 1 supported
 */
#define PERF_LINUX 0/1

/**
 * UEFI # default: 0
 * - 0 not supported
 * - 1 supported
 */
#define PERF_UEFI 0/1

/**
 * LLVM # default: deduced based on `llvm-dev` headers
 * - 0 not supported
 * - 1 supported
 */
#define PERF_LLVM 0/1

/**
 * Intel Processor Trace # default: deduced based on `intel_pt` headers
 * - 0 not supported
 * - 1 supported
 */
#define PERF_INTEL 0/1

/**
 * I/O support # default: 1
 * - 0 not compiled
 * - 1 supported (`log, json, report, annotate, plot`)
 */
#define PERF_IO 0/1

/**
 * tests # default: not-defined
 * - defined:     disables all compile-time, run-time tests
 * - not-defined: compile-time tests executed,
 *                run-time tests available by `perf::self::test()` API
 */
#define NTEST

/**
 * gnuplot terminal # see `gnuplot -> set terminal` # default: 'sixel'
 * - 'sixel'                  # console image # https://www.arewesixelyet.com
 * - 'wxt'                    # popup window
 * - 'canvas'                 # html
 * - 'dumb size 150,25 ansi'  # console with colors
 * - 'dumb size 80,25'        # console
 */
ENV:PERF_IO_PLOT_TERM

/**
 * style # default: dark
 * - light
 * - dark
 */
ENV:PERF_IO_PLOT_STYLE

Info/Core

namespace perf::info {
  /**
   * static_assert(version().major == 1);
   * static_assert(version().minor == 0);
   * static_assert(version().patch == 0);
   */
  inline constexpr auto version = [] -> sem_ver;
} // namespace perf::info

namespace perf::info::compiler {
  /**
   * verify(name() == "clang"s);
   */
  inline constexpr auto name = [] -> std::string_view;

  /**
   * static_assert(version().major == 20);
   * static_assert(version().minor == 0);
   * static_assert(version().patch == 0);
   */
  inline constexpr auto version = [] -> sem_ver;
} // namespace perf::info::compiler

// perf::info::cpu::name
assert(perf::info::cpu::name() == "12th Gen Intel(R) Core(TM) i7-12650"s);

// perf::info::cpu::code_name
assert(perf::info::cpu::code_name() == "alderlake"s);

// perf::info::cpu::version
assert(perf::info::cpu::version().family == 6);
assert(perf::info::cpu::version().model == 154);
assert(perf::info::cpu::version().stepping == 3);

// perf::info::cpu::dispatch_width
assert(perf::info::cpu::dispatch_width() == 6);

// perf::info::cpu::features
assert(perf::info::cpu::features() == std::vector{"avx", "avx2", "bmi", ...});

// info::memory::icache
assert(perf::info::memory::icache() ==
  std::map{{level::L1, {.size = 448KiB, .line_size = 64, .assoc = 8}}}
);

// info::memory::dcache
// Sizes use the binary-unit literals (`KiB`, `MiB`); `...` marks elided content.
assert(perf::info::memory::dcache() == std::map{
  {level::L1, {.size = 416KiB, .line_size = 64, .assoc = 8}}, ...},
  {level::L2, {.size = 9.5MiB, .line_size = 64, .assoc = 12}}, ...},
  {level::L3, {.size = 24MiB,  .line_size = 64, .assoc = 12}}, ...},
});

// info::sys::name
assert(perf::info::sys::name() == "linux"s);

// info::sys::triple
assert(perf::info::sys::triple() == "x86_64-pc-linux-gnu"s);

// info::sys::page_size
assert(perf::info::sys::page_size() == 4096b);

// info::proc::name
assert(perf::info::proc::self::name() == "/full/path/example.out"s);

// info::proc::base_address
assert(perf::info::proc::self::base_address());

// info::bin::addr_to_fn_name
static auto fn = [] {};
auto&& fn_name = perf::info::bin::addr_to_fn_name(
  perf::info::proc::self::name(),
  std::uint64_t(&fn) - perf::info::proc::self::base_address()
);
assert(fn_name.has_value() and *fn_name == "fn"s);

// info::bin::addr_to_name
static auto var = 0;
auto&& var_name = perf::info::bin::addr_to_name(
  perf::info::proc::self::name(),
  std::uint64_t(&var) - perf::info::proc::self::base_address()
);
assert(var_name.has_value() and *var_name == "var"s);

// info::bin::addr_to_line # requires debug symbols (-g)
label:; auto&& source = perf::info::bin::addr_to_line(
  perf::info::proc::self::name(),
  std::uint64_t(&&label) - perf::info::proc::self::base_address()
);
assert(source.has_value() and source->contains("label:;"));

// code::align
perf::code::align<std::align_val_t(64u)>(); for (...) { }

// code::label
perf::code::label<"begin">(); // begin:
perf::code::label<"end">();   // end:
assert(perf::code::labels["begin"] != perf::code::labels["end"]);

// compiler::prevent_reorder # std::atomic_signal_fence
perf::compiler::prevent_reorder();

// compiler::prevent_elision
int i{};
assert(perf::compiler::prevent_elision(i++));

// compiler::is_elided
// The first two lambdas have no observable effect; the third passes its
// increment through `prevent_elision`.
assert(perf::compiler::is_elided([] { }));
assert(perf::compiler::is_elided([] { int i{}; i++; }));
assert(not perf::compiler::is_elided([] {
  int i{};
  perf::compiler::prevent_elision(i++);
}));

// cpu::pipeline::flush
perf::cpu::pipeline::flush();

// memory::align/memory::is_aligned
auto aligned_addr = perf::memory::align<perf::memory::direction::up>(
  addr, std::align_val_t(64u)
);
assert(perf::memory::is_aligned(aligned_addr, std::align_val_t(64u)));

// memory::synchronize # std::atomic_thread_fence
perf::memory::synchronize();

// memory::prefetch
perf::memory::prefetch<perf::memory::operation::write,
                       perf::memory::locality::high>(addr);

// memory::lock # scoped{mlockall, munlockall}
{
  perf::memory::lock _;
}

// memory::protect
// The element type is spelled out on purpose: `std::array add{0x89, ...}`
// would deduce `std::array<int, 5>`, storing each opcode byte in its own
// 4-byte int instead of laying the bytes out contiguously.
// NOTE(review): assumes `invoke` executes the span's bytes as code - confirm.
const std::array<unsigned char, 5> add{ // x86-64
  0x89, 0xf8,         // mov eax, edi
  0x01, 0xf0,         // add eax, esi
  0xc3                // ret
};
perf::memory::protect(
  std::span(add),
  perf::memory::protection::read |
  perf::memory::protection::write |
  perf::memory::protection::exec
);
assert(invoke(add, 1, 2) == 3);
assert(invoke(add, 2, 3) == 5);

// memory::pollute # pollutes memory by making allocations
perf::memory::pollute(1024u);

// memory::pre_fault # touches all pages used by data
perf::memory::pre_fault(std::span(...));

// memory::flush # clears all cache lines used by data
perf::memory::cache::flush(std::span(...));

// thread::affinity
// Sets the affinity of the current thread to `2u`, then reads it back.
perf::thread::affinity::set(perf::thread::self, 2u);
assert(2u == perf::thread::affinity::get(perf::thread::self));

Profiling/Analyzing

https://kris-jusiak.github.io/talks/cppcon-2025

Benchmarking/Testing

https://kris-jusiak.github.io/talks/cppcon-2025

Synopsis

User Guide

Setup Guide

How to setup perf docker?

Dockerfile

docker build -t perf .

docker run \
  -it \
  --privileged \
  --network=host \
  -e DISPLAY=${DISPLAY} \
  -v ${PWD}:${PWD} \
  -w ${PWD} \
  perf

How to install perf dependencies?

apt-get install linux-tools-common # linux-perf (perf::stat/perf::record)
apt-get install llvm-dev           # llvm (perf::mc/perf::mca)
apt-get install libipt-dev         # libipt (perf::trace)
apt-get install gnuplot            # (perf::plot)

How to setup linux performance counters?

scripts/setup.sh

.github/scripts/setup.sh --perf # --rdpmc --max-sample-rate 10000

linux

sudo mount -o remount,mode=755 /sys/kernel/debug
sudo mount -o remount,mode=755 /sys/kernel/debug/tracing
sudo chown `whoami` /sys/kernel/debug/tracing/uprobe_events
sudo chmod a+rw /sys/kernel/debug/tracing/uprobe_events
echo 0 | sudo tee /proc/sys/kernel/kptr_restrict
echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid
echo 1000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rate

echo 2 | sudo tee /sys/devices/cpu_core/rdpmc

How to reduce execution variability?

scripts/tune.sh

.github/scripts/tune.sh

pyperf - pip3 install pyperf

sudo pyperf system tune
sudo pyperf system show
sudo pyperf system reset

linux

# Set Process CPU Affinity (apt install util-linux)
taskset -c 0 ./a.out

# Set Process Scheduling Priority (apt install coreutils)
nice -n -20 taskset -c 0 ./a.out # -20..19 (most..least favorable to the process)

# Disable CPU Frequency Scaling (apt install cpufrequtils)
sudo cpupower frequency-set --governor performance
# cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# Disable Address Space Randomization
echo 0 | sudo tee /proc/sys/kernel/randomize_va_space

# Disable Processor Boosting
echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost

# Disable Turbo Mode
echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo

# Disable Hyperthreading/SMT
echo off | sudo tee /sys/devices/system/cpu/smt/control

# Restrict memory to a single socket
numactl -m 0 -N 0 ./a.out

# Enable Huge Pages
sudo numactl --cpunodebind=1 --membind=1 hugeadm \
  --obey-mempolicy --pool-pages-min=1G:64
sudo hugeadm --create-mounts

bootloader

# Enable Kernel Mode Task-Isolation (https://lwn.net/Articles/816298)
# cat /sys/devices/system/cpu/isolated
isolcpus=<cpu number>,...,<cpu number>

# Disable P-states and C-states
# cat /sys/devices/system/cpu/intel_pstate/status
idle=poll intel_pstate=disable intel_idle.max_cstate=0 processor.max_cstate=1

# Disable NMI watchdog
# cat /proc/sys/kernel/nmi_watchdog
nmi_watchdog=0

uefi

clang++ -std=c++20 -I. \
  -target x86_64-pc-win32-coff \
  -fno-stack-protector \
  -fshort-wchar \
  -mno-red-zone \
  -c uefi.cpp -o uefi.o

lld-link \
  -filealign:16 \
  -subsystem:efi_application \
  -nodefaultlib \
  -dll \
  -entry:efi_main \
  -out:BOOTX64.EFI \
  uefi.o

mkdir -p efi/boot && cp BOOTX64.EFI /usr/share/ovmf/OVMF.fd efi/boot

qemu-system-x86_64 \
  -drive if=pflash,format=raw,file=efi/boot/OVMF.fd \
  -drive format=raw,file=fat:rw:. \
  -net none

Usage Guide

How to compile perf with modules?

clang

clang++ -std=c++23 -O3 -I. --precompile perf.cppm
clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -lipt

import perf;

How to change assembly syntax?

perf::llvm llvm{
  {.syntax = perf::arch::syntax::att} // default: intel
};

How to analyze for a different architecture?

perf::llvm llvm{
  .triple = "x86_64-pc-linux-gnu" // see `llvm-llc` for details
};

Which terminal can display images?

Any terminal with sixel support - https://www.arewesixelyet.com
(Visual Studio Code images support in terminal - Terminal -> Enable images option)

How to change plotting terminal?

PERF_IO_PLOT_TERM='sixel'                 # terminal - sixel
PERF_IO_PLOT_TERM='dumb size 80,25'       # terminal ascii
PERF_IO_PLOT_TERM='dumb size 150,25 ansi' # terminal ansi
PERF_IO_PLOT_TERM='wxt'                   # popup windows
PERF_IO_PLOT_TERM='canvas'                # html
PERF_IO_PLOT_TERM='png'                   # png

gnuplot: set terminal # available options

http://www.bersch.net/gnuplot-doc/complete-list-of-terminals.html

How to change plot style?

PERF_IO_PLOT_STYLE='dark'  # dark - default
PERF_IO_PLOT_STYLE='light' # light

How to save plot?

perf::plot::gnuplot plt{{.term = "png"}};
plt.send("set output 'output.png'");
perf::plot::bar(plt, ...);

How to export results?

scripts/export.sh

./a.out 2>&1 | .github/scripts/export.sh markdown > results.md
./a.out 2>&1 | .github/scripts/export.sh notebook > results.ipynb
./a.out 2>&1 | .github/scripts/export.sh html > results.html

How to share results?

gh - apt-get install gh

# https://jbt.github.io/markdown-editor
gh gist create --public --web results.md

# https://jupyter.org
gh gist create --public --web results.ipynb

# https://htmlpreview.github.io
gh gist create --public --web results.html

How to write custom profiler?

// Skeleton of a user-defined profiler: `start()`/`stop()` bracket the
// measured region and `operator[]` looks up results.
struct my_profiler {
  constexpr auto start();
  constexpr auto stop();
  // `Ts` must be declared as a template parameter pack to be usable here.
  template <class... Ts>
  [[nodiscard]] constexpr auto operator[](Ts...) const;
};

static_assert(perf::profiler_like<my_profiler>);

perf::runner bench{
  [](auto&& fn, auto&&... ts) {
    my_profiler profiler{};
    profiler.start();
    perf::compiler::prevent_elision(fn(ts...));
    profiler.stop();
  }
};

How to integrate with profiling tools?

linux-perf - apt-get install linux-tools-common
intel-vtune - apt-get install intel-oneapi-vtune
amd-uprof - https://www.amd.com/en/developer/uprof.html#downloads
gperftools - apt-get install google-perftools
llvm-xray - apt-get install llvm
callgrind - apt-get install valgrind

#include <fcntl.h>
#include <unistd.h>
/*
 * https://perf.wiki.kernel.org
 * https://perfwiki.github.io
 *
 * int main() {
 *   linux_perf profiler{"/dev/shm/perf"};
 *   profiler.start();
 *   // ...
 *   profiler.stop();
 * }
 *
 * $CXX -std=c++20 -O3 -g perf.cpp
 * perf record --control=fifo:/dev/shm/perf --delay=-1 ./a.out
 */
// Drives `perf record --control=fifo:<path>` by writing the `enable` /
// `disable` commands to the control file opened in the constructor.
//
// The members are not `constexpr`: they call `open`/`write`/`close`, which
// can never be constant-evaluated, and compilers reject that under the
// `-std=c++20` mode used in the build line above.
class linux_perf {
  // Arrays rather than `const char*`, so `sizeof` is the literal's size
  // (including the terminating NUL) and not the size of a pointer.
  static constexpr char enable[] = "enable\n";
  static constexpr char disable[] = "disable\n";

 public:
  // Opens `control` write-only; on failure `fd_` is -1 and `start`/`stop`
  // return the error result of `write`.
  explicit linux_perf(std::string&& control)
    : fd_{open(control.c_str(), O_WRONLY)}
  { }
  linux_perf(const linux_perf&) = delete;
  // Takes over the descriptor; the source is marked as not owning one.
  linux_perf(linux_perf&& other) noexcept
    : fd_{other.fd_} {
    other.fd_ = -1;
  }
  ~linux_perf() noexcept {
    if (fd_ == -1) return;
    close(fd_);
  }

  // Writes the `enable` command (without the NUL); returns `write`'s result.
  auto start() {
    return write(fd_, enable, sizeof(enable) - 1);
  }

  // Writes the `disable` command (without the NUL); returns `write`'s result.
  auto stop() {
    return write(fd_, disable, sizeof(disable) - 1);
  }

 private:
  int fd_{-1};  // -1 == no descriptor (0 would be a valid descriptor)
};

#include <ittnotify.h>
/**
 * https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html
 * https://github.com/intel/ittapi
 *
 * int main() {
 *   intel_vtune profiler{"domain", "task"};
 *   profiler.start();
 *   // ...
 *   profiler.stop();
 * }
 *
 * $CXX -std=c++20 -O3 -g vtune.cpp -littnotify
 * vtune -collect performance-snapshot -start-paused -finalization-mode=full -r result -- ./a.out
 */
class intel_vtune {
 public:
  constexpr explicit intel_vtune(std::string&& domain, std::string&& task)
    : domain_{__itt_domain_create(domain.c_str())}, task_name_{__itt_string_handle_create(task.c_str())} {
    __itt_task_begin(domain_, __itt_null, __itt_null, task_name_);
    assert(__builtin_cpu_is("intel"));
  }
  constexpr intel_vtune(intel_vtune&&) = default;
  constexpr intel_vtune(const intel_vtune&) = delete;
  constexpr ~intel_vtune() noexcept {
    __itt_task_end(domain_);
  }

  constexpr auto start() {
    return __itt_resume();
  }

  constexpr auto stop() {
    return __itt_pause();
  }

 private:
  __itt_domain* domain_{};
  __itt_string_handle* task_name_{};
};

#include <AMDProfileController.h>
/**
 * https://www.amd.com/en/developer/uprof.html
 *
 * int main() {
 *   amd_uprof profiler{};
 *   profiler.start();
 *   // ...
 *   profiler.stop();
 * }
 * $CXX -std=c++20 -O3 -g uprof.cpp -lAMDProfileController
 * AMDuProfCLI collect --config tbp --start-paused ./a.out
 */
// Thin wrapper over the AMDProfileController calls used in this snippet:
// `start` calls `amdProfileResume`, `stop` calls `amdProfilePause`.
//
// The members are not `constexpr`: they call functions that can never be
// constant-evaluated, which compilers reject under the `-std=c++20` mode
// used in the build line above.
class amd_uprof {
 public:
  amd_uprof() {
    assert(__builtin_cpu_is("amd"));
  }
  amd_uprof(amd_uprof&&) = default;
  amd_uprof(const amd_uprof&) = delete;

  auto start() {
    return amdProfileResume();
  }

  auto stop() {
    return amdProfilePause();
  }
};

#include <gperftools/profiler.h>
/**
 * int main() {
 *   gperf profiler{"gperf"};
 *   profiler.start();
 *   // ...
 *   profiler.stop();
 *   profiler.flush(); // optional
 * }
 *
 * $CXX -g -O3 perf.cpp -lprofiler
 * CPUPROFILE_FREQUENCY=1000 ./a.out
 * google-pprof a.out profile.prof
 */
// Wraps the gperftools CPU-profiler calls used in this snippet: `start`
// calls `ProfilerStart` with the stored file name, `stop` calls
// `ProfilerStop`, and `flush` (also run by the destructor) calls
// `ProfilerFlush`.
//
// The members are not `constexpr`: they call functions that can never be
// constant-evaluated, and a `constexpr` destructor is not valid before
// C++20 (the build line above does not select a standard).
class gperf {
 public:
  explicit gperf(std::string&& fname)
    : fname_{std::move(fname)}
  { }
  gperf(gperf&&) = default;
  gperf(const gperf&) = delete;
  ~gperf() noexcept {
    flush();
  }

  auto start() {
    return ProfilerStart(fname_.c_str());
  }

  auto stop() {
    return ProfilerStop();
  }

  void flush() {
    return ProfilerFlush();
  }

 private:
  std::string fname_{};  // file name handed to `ProfilerStart`
};

#include <xray/xray_interface.h>
#include <xray/xray_log_interface.h>
/**
 * https://llvm.org/docs/XRay.html
 * https://godbolt.org/z/WhsEYf9cc
 *
 * int main() {
 *   llvm_xray profiler{"xray-fdr"};
 *   profiler.start();
 *   // ...
 *   profiler.stop();
 *   profiler.flush(); // optional
 * }
 *
 * [[clang::xray_always_instrument]] void always_profile();
 * [[clang::xray_always_instrument, clang::xray_log_args(1)]] void always_profile_and_log_i(int i);
 * [[clang::xray_never_instrument]] void never_profile();
 *
 * void handler([[maybe_unused]] int32_t func_id, XRayEntryType entry) {
 *   if (entry == XRayEntryType::ENTRY) {
 *     // profiler.start();
 *   } else {
 *     // profiler.stop();
 *   }
 * }
 *
 * int main() {
 *   __xray_set_handler(handler);
 *   __xray_patch();
 * }
 *
 * clang++ -std=c++20 -O3 -g xray.cpp -fxray-instrument -fxray-instruction-threshold=1
 * ./a.out
 * llvm-xray account xray-log.* --top=10 --sort=sum --sortorder=dsc --instr_map=./a.out
 * llvm-xray extract ./a.out --symbolize
 */
// Wraps the XRay runtime calls used in this snippet: the constructor
// selects and initializes the log mode, `start`/`stop` call
// `__xray_patch`/`__xray_unpatch`, and `flush` (also run by the
// destructor) finalizes and flushes the log.
//
// The members are not `constexpr`: they call runtime functions that can
// never be constant-evaluated, which compilers reject under the
// `-std=c++20` mode used in the build line above.
class llvm_xray {
 public:
  explicit llvm_xray(
    std::string&& mode = "xray-fdr",
    std::string&& cfg = "xray_logfile_base=xray-log.%"
  ) {
    __xray_log_select_mode(mode.c_str());
    __xray_log_init_mode(mode.c_str(), cfg.c_str());
  }
  llvm_xray(llvm_xray&&) = default;
  llvm_xray(const llvm_xray&) = delete;
  ~llvm_xray() noexcept {
    flush();
  }

  [[clang::xray_never_instrument]] auto start() {
    return __xray_patch();
  }

  [[clang::xray_never_instrument]] auto stop() {
    return __xray_unpatch();
  }

  [[clang::xray_never_instrument]] void flush() {
    __xray_log_finalize();
    __xray_log_flushLog();
  }
};

#include <valgrind/callgrind.h>
/**
 * int main() {
 *   callgrind profiler{"profile"};
 *
 *   while (true) {
 *     profiler.start(); // resets profile
 *
 *     if (should_trigger()) {
 *       trigger();
 *       profiler.stop();
 *       profiler.flush(); // dumps the `profile` profile
 *     }
 *   }
 * }
 *
 * $CXX -std=c++20 -O3 -g callgrind.cpp
 * valgrind --tool=callgrind --instr-atstart=no \ # 5x-100x overhead
 *  --cache-sim=yes --branch-sim=yes --collect-jumps=yes --dump-instr=yes a.out
 * kcachegrind callgrind.*
 */
// Wraps the valgrind client-request macros used in this snippet:
// `start`/`stop` expand `CALLGRIND_START_INSTRUMENTATION` /
// `CALLGRIND_STOP_INSTRUMENTATION`, and `flush` (also run by the
// destructor) expands `CALLGRIND_DUMP_STATS_AT` with the stored name.
//
// The members are not `constexpr`: the client-request macros can never be
// constant-evaluated, which compilers reject under the `-std=c++20` mode
// used in the build line above.
class callgrind {
 public:
  explicit callgrind(std::string&& profile)
    : profile_{std::move(profile)}
  { }
  callgrind(callgrind&&) = default;
  callgrind(const callgrind&) = delete;
  ~callgrind() noexcept {
    flush();
  }

  auto start() {
    CALLGRIND_START_INSTRUMENTATION;
  }

  auto stop() {
    CALLGRIND_STOP_INSTRUMENTATION;
  }

  void flush() {
    CALLGRIND_DUMP_STATS_AT(profile_.c_str());
  }

 private:
  std::string profile_{};  // name passed to `CALLGRIND_DUMP_STATS_AT`
};

How to integrate with unit-testing frameworks?

import perf;
import ut; // https://github.com/qlibs/ut

int main() {
  perf::runner bench{perf::bench::latency{}};
  // `on_exit` reports the `stat::cpu_time` entry of the runner.
  // NOTE(review): presumably invoked when `_` goes out of scope - confirm.
  perf::scoped _ {
    .on_exit = [&] {
      perf::report(bench[perf::stat::cpu_time]);
    }
  };

  // `bench` is a local variable, so each test body has to capture it;
  // a capture-less lambda cannot name it.
  "benchmark1"_test = [&] {
    bench(fn1, ts1...);
  };

  // Distinct name, so the two benchmarks can be told apart.
  "benchmark2"_test = [&] {
    bench(fn2, ts2...);
  };
}

How perf tests are working?

compile-time tests are executed upon include/import (enabled by default)
run-time/sanity check tests can be executed at run-time

int main() {
  perf::self::test({.verbose = true}); // run-time/sanity check tests
}

-DNTEST can be used to disable tests (not recommended)

$CXX -DNTEST ... # tests will NOT be compiled

perf tests execution model

#ifndef NTEST
"perf"_suite = [] {
  "run-time and compile-time"_test = [] constexpr {
    expect(3 == accumulate({1, 2, 3}, 0));
  };

  "run-time"_test = [] mutable {
    expect(std::rand() >= 0);
  };

  "compile-time"_test = [] consteval {
    expect(sizeof(int) == sizeof(0));
  };
};
#endif

How to integrate with jupyter?

jupyter (apt install jupyter) can be used for data analysis (python)

int main() {
  // ... perf
  perf::json("perf.json");
}

# notebook.ipynb
import pandas as pd
df = pd.read_json("perf.json")
print(df.head())

jupyter notebook notebook.ipynb

What is the difference between latency and throughput?

latency is the time it takes for a single operation to complete (ns)
throughput is the total number of operations or tasks completed in a given amount of time (op/s)
What is top-down microarchitecture analysis method?

https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html
https://github.com/andikleen/pmu-tools/wiki/toplev-manual

What are performance compilation flags?

-O1                     # optimizations (O1) [0]
-O2                     # optimizations (O1 + O2) [0]
-O3                     # optimizations (O1 + O2 + O3) [0]
-march=native           # architecture specific [1]
-DNDEBUG                # disables asserts, etc.

-fno-omit-frame-pointer # keeps the frame pointer in a register

-ffast-math             # [unsafe] faster but non-conforming math [2]
-fcf-protection=none    # [unsafe] stops emitting `endbr64`

[0] https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
[1] https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
[2] https://gcc.gnu.org/wiki/FloatingPointMath

What are performance compiler attributes?

gnu::target
```
[[gnu::target("avx2")]]
[[gnu::target("bmi")]]
```
gnu::optimize
```
[[gnu::optimize("O3")]]
[[gnu::optimize("ffast-math")]]
```
https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html

Resources

License

MIT/Apache2+LLVM

license namespace guard description

MIT perf::* - https://opensource.org/license/mit

Apache2+LLVM perf::mca::* PERF_LLVM == 1 https://llvm.org/LICENSE.txt

LICENSE

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

`perf`: C++2x Performance library

Overview

Features

Requirements

Showcase

API

User Guide

License

About

Uh oh!

Releases

Uh oh!

Languages

Name		Name	Last commit message	Last commit date
Latest commit History 1 Commit
.github		.github
perf		perf
perf.cppm		perf.cppm

names	description	API
`info`	hardware/software info	`compiler`, `cpu`, `memory`, `sys`, `proc`, `bin`
`[core]`	low-level utilities	`code`, `compiler`, `cpu`, `memory`
`profiler`	timing (rdtsc/clock/chrono)	`tsc`, `process_time`, `thread_time`, `steady_time`,
`profiler`	counting (linux/perf)	`instructions`, `cycles`, ..., `top_down`
`profiler`	sampling (linux/perf)	`instructions`, `cycles`, ..., `mem_loads`, `mem_stores`, `top_down`
`profiler`	tracing (linux/intel_pt)	`instructions`, `cycles`, `tsc`
`analyzer`	disassembling (llvm)	`mca::assembly`, `mca::address`, `mca::encoding`, `mca::size`, `mca::uops`, `mca::latency`, `mca::rthroughput`, `mca::may_load`, `mca::may_store`, `mca::has_side_effects`, ..., `mca::debug::source`
`analyzer`	analyzing (llvm/mca)	`mca::timeline`, `mca::resource_pressure`, `mca::bottleneck`
`runner`	benchmarking	`bench::baseline`, `bench::latency`, `bench::throughput`
`[io]`	logging/plotting (gnuplot/sixel)	`log`, `json`, `report`, `annotate`, `plot` (`hist`, `box`, `bar`, `line`, `ecdf`)

license	namespace	guard	description
MIT	`perf::*`	-	https://opensource.org/license/mit
Apache2+LLVM	`perf::mca::*`	`PERF_LLVM == 1`	https://llvm.org/LICENSE.txt

qlibs/perf

Folders and files

Latest commit

History

Repository files navigation

perf: C++2x Performance library

Overview

Features

Requirements

Showcase

API

User Guide

License

About

Topics

Resources

Uh oh!

Stars

Watchers

Forks

Releases

Uh oh!

Languages

`perf`: C++2x Performance library