From db5f215d47c8f7d2c078e2b93968d3145a14472c Mon Sep 17 00:00:00 2001 From: Joe Richey Date: Sun, 10 Jul 2022 00:20:20 -0700 Subject: [PATCH] Add/Rework benchmarks to track initialization cost This PR adds more benchmarks so we can get an accurate idea about two things: - What is the cost of having to zero the buffer before calling `getrandom`? - What is the performance on aligned, 32-byte buffers? - This is by far the most common use, as it's used to seed userspace CSPRNGs. I ran the benchmarks on my system: - CPU: AMD Ryzen 7 5700G - OS: Linux 5.15.52-1-lts - Rust Version: 1.62.0-nightly (ea92b0838 2022-05-07) I got the following results: ``` test bench_large ... bench: 3,759,323 ns/iter (+/- 177,100) = 557 MB/s test bench_large_init ... bench: 3,821,229 ns/iter (+/- 39,132) = 548 MB/s test bench_page ... bench: 7,281 ns/iter (+/- 59) = 562 MB/s test bench_page_init ... bench: 7,290 ns/iter (+/- 69) = 561 MB/s test bench_seed ... bench: 206 ns/iter (+/- 3) = 155 MB/s test bench_seed_init ... bench: 206 ns/iter (+/- 1) = 155 MB/s ``` These results were very consistent across multiple runs, and roughly behave as we would expect: - The throughput is highest with a buffer large enough to amortize the syscall overhead, but small enough to stay in the L1D cache. - There is a _very_ small cost to zeroing the buffer beforehand. - This cost is imperceptible in the common 32-byte use case, where the syscall overhead dominates. - The cost is slightly higher (1%) with multi-megabyte buffers as the data gets evicted from the L1 cache between the `memset` and the call to `getrandom`. I would love to see results for other platforms. Could we get someone to run this on an M1 Mac? 
Signed-off-by: Joe Richey --- benches/mod.rs | 88 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/benches/mod.rs b/benches/mod.rs index a93e7206..11be47eb 100644 --- a/benches/mod.rs +++ b/benches/mod.rs @@ -1,22 +1,94 @@ #![feature(test)] extern crate test; -#[bench] -fn bench_64(b: &mut test::Bencher) { - let mut buf = [0u8; 64]; +use std::{ + alloc::{alloc_zeroed, dealloc, Layout}, + ptr::NonNull, +}; + +// AlignedBuffer is like a Box<[u8; N]> except that it is always N-byte aligned +struct AlignedBuffer(NonNull<[u8; N]>); + +impl AlignedBuffer { + fn layout() -> Layout { + Layout::from_size_align(N, N).unwrap() + } + + fn new() -> Self { + let p = unsafe { alloc_zeroed(Self::layout()) } as *mut [u8; N]; + Self(NonNull::new(p).unwrap()) + } + + fn buf(&mut self) -> &mut [u8; N] { + unsafe { self.0.as_mut() } + } +} + +impl Drop for AlignedBuffer { + fn drop(&mut self) { + unsafe { dealloc(self.0.as_ptr() as *mut u8, Self::layout()) } + } +} + +// Used to benchmark the throughput of getrandom in an optimal scenario. +// The buffer is hot, and does not require initialization. +#[inline(always)] +fn bench(b: &mut test::Bencher) { + let mut ab = AlignedBuffer::::new(); + let buf = ab.buf(); b.iter(|| { getrandom::getrandom(&mut buf[..]).unwrap(); test::black_box(&buf); }); - b.bytes = buf.len() as u64; + b.bytes = N as u64; } -#[bench] -fn bench_65536(b: &mut test::Bencher) { - let mut buf = [0u8; 65536]; +// Used to benchmark the throughput of getrandom in a slightly less optimal +// scenario. The buffer is still hot, but requires initialization. 
+#[inline(always)] +fn bench_with_init(b: &mut test::Bencher) { + let mut ab = AlignedBuffer::::new(); + let buf = ab.buf(); b.iter(|| { + for byte in buf.iter_mut() { + *byte = 0; + } getrandom::getrandom(&mut buf[..]).unwrap(); test::black_box(&buf); }); - b.bytes = buf.len() as u64; + b.bytes = N as u64; +} + +// 32 bytes (256-bit) is the seed size used for rand::thread_rng +const SEED: usize = 32; +// Common size of a page, 4 KiB +const PAGE: usize = 4096; +// Large buffer to get asymptotic performance, 2 MiB +const LARGE: usize = 1 << 21; + +#[bench] +fn bench_seed(b: &mut test::Bencher) { + bench::(b); +} +#[bench] +fn bench_seed_init(b: &mut test::Bencher) { + bench_with_init::(b); +} + +#[bench] +fn bench_page(b: &mut test::Bencher) { + bench::(b); +} +#[bench] +fn bench_page_init(b: &mut test::Bencher) { + bench_with_init::(b); +} + +#[bench] +fn bench_large(b: &mut test::Bencher) { + bench::(b); +} +#[bench] +fn bench_large_init(b: &mut test::Bencher) { + bench_with_init::(b); }