From db5f215d47c8f7d2c078e2b93968d3145a14472c Mon Sep 17 00:00:00 2001
From: Joe Richey <joerichey@google.com>
Date: Sun, 10 Jul 2022 00:20:20 -0700
Subject: [PATCH] Add/Rework benchmarks to track initialization cost

This PR adds more benchmarks so we can get an accurate idea about two
things:

  - What is the cost of having to zero the buffer before calling
    `getrandom`?
  - What is the performance on aligned, 32-byte buffers?
    - This is by far the most common use, as it's used to seed
      userspace CSPRNGs.

I ran the benchmarks on my system:
  - CPU: AMD Ryzen 7 5700G
  - OS: Linux 5.15.52-1-lts
  - Rust Version: 1.62.0-nightly (ea92b0838 2022-05-07)

I got the following results:
```
test bench_large      ... bench:   3,759,323 ns/iter (+/- 177,100) = 557 MB/s
test bench_large_init ... bench:   3,821,229 ns/iter (+/- 39,132) = 548 MB/s
test bench_page       ... bench:       7,281 ns/iter (+/- 59) = 562 MB/s
test bench_page_init  ... bench:       7,290 ns/iter (+/- 69) = 561 MB/s
test bench_seed       ... bench:         206 ns/iter (+/- 3) = 155 MB/s
test bench_seed_init  ... bench:         206 ns/iter (+/- 1) = 155 MB/s
```

These results were very consistent across multiple runs, and roughly
behave as we would expect:
  - The throughput is highest with a buffer large enough to amortize the
    syscall overhead, but small enough to stay in the L1D cache.
  - There is a _very_ small cost to zeroing the buffer beforehand.
  - This cost is imperceptible in the common 32-byte usecase, where the
    syscall overhead dominates.
  - The cost is slightly higher (1%) with multi-megabyte buffers as the
    data gets evicted from the L1 cache between the `memset` and the
    call to `getrandom`.

I would love to see results for other platforms. Could we get someone to
run this on an M1 Mac?

Signed-off-by: Joe Richey <joerichey@google.com>
---
 benches/mod.rs | 88 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 80 insertions(+), 8 deletions(-)
diff --git a/benches/mod.rs b/benches/mod.rs
index a93e7206..11be47eb 100644
--- a/benches/mod.rs
+++ b/benches/mod.rs
@@ -1,22 +1,94 @@
 #![feature(test)]
 extern crate test;
 
-#[bench]
-fn bench_64(b: &mut test::Bencher) {
-    let mut buf = [0u8; 64];
+use std::{
+    alloc::{alloc_zeroed, dealloc, Layout},
+    ptr::NonNull,
+};
+
+// AlignedBuffer is like a Box<[u8; N]> except that it is always N-byte aligned
+struct AlignedBuffer<const N: usize>(NonNull<[u8; N]>);
+
+impl<const N: usize> AlignedBuffer<N> {
+    fn layout() -> Layout {
+        Layout::from_size_align(N, N).unwrap()
+    }
+
+    fn new() -> Self {
+        let p = unsafe { alloc_zeroed(Self::layout()) } as *mut [u8; N];
+        Self(NonNull::new(p).unwrap())
+    }
+
+    fn buf(&mut self) -> &mut [u8; N] {
+        unsafe { self.0.as_mut() }
+    }
+}
+
+impl<const N: usize> Drop for AlignedBuffer<N> {
+    fn drop(&mut self) {
+        unsafe { dealloc(self.0.as_ptr() as *mut u8, Self::layout()) }
+    }
+}
+
+// Used to benchmark the throughput of getrandom in an optimal scenario.
+// The buffer is hot, and does not require initialization.
+#[inline(always)]
+fn bench<const N: usize>(b: &mut test::Bencher) {
+    let mut ab = AlignedBuffer::<N>::new();
+    let buf = ab.buf();
     b.iter(|| {
         getrandom::getrandom(&mut buf[..]).unwrap();
         test::black_box(&buf);
     });
-    b.bytes = buf.len() as u64;
+    b.bytes = N as u64;
 }
 
-#[bench]
-fn bench_65536(b: &mut test::Bencher) {
-    let mut buf = [0u8; 65536];
+// Used to benchmark the throughput of getrandom in a slightly less optimal
+// scenario. The buffer is still hot, but requires initialization.
+#[inline(always)]
+fn bench_with_init<const N: usize>(b: &mut test::Bencher) {
+    let mut ab = AlignedBuffer::<N>::new();
+    let buf = ab.buf();
     b.iter(|| {
+        for byte in buf.iter_mut() {
+            *byte = 0;
+        }
         getrandom::getrandom(&mut buf[..]).unwrap();
         test::black_box(&buf);
     });
-    b.bytes = buf.len() as u64;
+    b.bytes = N as u64;
+}
+
+// 32 bytes (256-bit) is the seed size used for rand::thread_rng
+const SEED: usize = 32;
+// Common size of a page, 4 KiB
+const PAGE: usize = 4096;
+// Large buffer to get asymptotic performance, 2 MiB
+const LARGE: usize = 1 << 21;
+
+#[bench]
+fn bench_seed(b: &mut test::Bencher) {
+    bench::<SEED>(b);
+}
+#[bench]
+fn bench_seed_init(b: &mut test::Bencher) {
+    bench_with_init::<SEED>(b);
+}
+
+#[bench]
+fn bench_page(b: &mut test::Bencher) {
+    bench::<PAGE>(b);
+}
+#[bench]
+fn bench_page_init(b: &mut test::Bencher) {
+    bench_with_init::<PAGE>(b);
+}
+
+#[bench]
+fn bench_large(b: &mut test::Bencher) {
+    bench::<LARGE>(b);
+}
+#[bench]
+fn bench_large_init(b: &mut test::Bencher) {
+    bench_with_init::<LARGE>(b);
 }