From 5ceb15b22a58fe08e0fa56aebc7d60cdcccf3d1d Mon Sep 17 00:00:00 2001
From: not-matthias <matthias@codspeed.io>
Date: Tue, 22 Apr 2025 10:08:46 +0200
Subject: [PATCH 1/3] feat(codspeed): add runner ipc via fifo

---
 Cargo.lock                       |  50 +++++--
 Cargo.toml                       |   1 +
 crates/cargo-codspeed/Cargo.toml |   2 +-
 crates/codspeed/Cargo.toml       |   3 +
 crates/codspeed/src/fifo.rs      | 229 +++++++++++++++++++++++++++++++
 crates/codspeed/src/lib.rs       |   2 +
 crates/codspeed/src/shared.rs    |  15 ++
 7 files changed, 291 insertions(+), 11 deletions(-)
 create mode 100644 crates/codspeed/src/fifo.rs
 create mode 100644 crates/codspeed/src/shared.rs

diff --git a/Cargo.lock b/Cargo.lock
index 6d58a51f..ae5e776d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -83,9 +83,9 @@ dependencies = [
 
 [[package]]
 name = "anyhow"
-version = "1.0.86"
+version = "1.0.97"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
+checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f"
 
 [[package]]
 name = "approx"
@@ -386,6 +386,15 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7dfdb4953a096c551ce9ace855a604d702e6e62d77fac690575ae347571717f5"
 
+[[package]]
+name = "bincode"
+version = "1.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -511,6 +520,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
 [[package]]
 name = "ciborium"
 version = "0.2.2"
@@ -583,8 +598,11 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
 name = "codspeed"
 version = "2.10.1"
 dependencies = [
+ "anyhow",
+ "bincode",
  "colored",
  "libc",
+ "nix",
  "serde",
  "serde_json",
  "tempfile",
@@ -1168,9 +1186,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 
 [[package]]
 name = "libc"
-version = "0.2.155"
+version = "0.2.171"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
+checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
 
 [[package]]
 name = "libm"
@@ -1260,6 +1278,18 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "nix"
+version = "0.29.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46"
+dependencies = [
+ "bitflags 2.6.0",
+ "cfg-if",
+ "cfg_aliases",
+ "libc",
+]
+
 [[package]]
 name = "normalize-line-endings"
 version = "0.3.0"
@@ -1857,18 +1887,18 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"
 
 [[package]]
 name = "thiserror"
-version = "2.0.11"
+version = "2.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
+checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "2.0.11"
+version = "2.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
+checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -1887,9 +1917,9 @@ dependencies = [
 
 [[package]]
 name = "tokio"
-version = "1.39.1"
+version = "1.44.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d040ac2b29ab03b09d4129c2f5bbd012a3ac2f79d38ff506a4bf8dd34b0eac8a"
+checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a"
 dependencies = [
  "backtrace",
  "pin-project-lite",
diff --git a/Cargo.toml b/Cargo.toml
index 4dc8eff2..89c50f5f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,7 @@ members = [
 resolver = "2"
 
 [workspace.dependencies]
+anyhow = "1.0.97"
 itertools = "0.14.0"
 serde = { version = "1.0.217", features = ["derive"] }
 serde_json = "1.0.138"
diff --git a/crates/cargo-codspeed/Cargo.toml b/crates/cargo-codspeed/Cargo.toml
index 5fccd680..e752e8a8 100644
--- a/crates/cargo-codspeed/Cargo.toml
+++ b/crates/cargo-codspeed/Cargo.toml
@@ -21,7 +21,7 @@ keywords = ["codspeed", "benchmark", "cargo"]
 cargo_metadata = "0.19.2"
 clap = { version = "=4.5.17", features = ["derive", "env"] }
 termcolor = "1.4"
-anyhow = "1.0.86"
+anyhow = { workspace = true }
 itertools = { workspace = true }
 anstyle = "1.0.8"
 serde = { workspace = true }
diff --git a/crates/codspeed/Cargo.toml b/crates/codspeed/Cargo.toml
index 4ff93709..350270a6 100644
--- a/crates/codspeed/Cargo.toml
+++ b/crates/codspeed/Cargo.toml
@@ -18,8 +18,11 @@ categories = [
 keywords = ["codspeed", "benchmark"]
 
 [dependencies]
+anyhow = { workspace = true }
+bincode = "1.3.3"
 colored = "2.0.0"
 libc = "^0.2"
+nix = { version = "0.29.0", features = ["fs"] }
 serde = { workspace = true }
 serde_json = { workspace = true }
 uuid = { version = "1.12.1", features = ["v4"] }
diff --git a/crates/codspeed/src/fifo.rs b/crates/codspeed/src/fifo.rs
new file mode 100644
index 00000000..347cb5e9
--- /dev/null
+++ b/crates/codspeed/src/fifo.rs
@@ -0,0 +1,348 @@
+//! IPC with the CodSpeed runner over a pair of named pipes (FIFOs).
+//!
+//! Commands are written to a control FIFO and acknowledged over a separate ack
+//! FIFO. Every message is a little-endian `u32` length prefix followed by the
+//! `bincode` encoding of a [`Command`].
+
+pub use super::shared::*;
+use anyhow::bail;
+use nix::libc::O_NONBLOCK;
+use nix::sys::stat;
+use nix::unistd::{self, unlink};
+use std::fs::{File, OpenOptions};
+use std::io::Read;
+use std::os::unix::fs::OpenOptionsExt;
+use std::path::{Path, PathBuf};
+
+/// Marks a benchmark as running for as long as the value is alive.
+///
+/// Creating the guard sends [`Command::SetIntegration`] and then
+/// [`Command::StartBenchmark`]; dropping it sends [`Command::StopBenchmark`].
+/// Each command is acknowledged by the peer before the call returns.
+#[derive(Debug)]
+pub struct BenchGuard {
+    ctl_fifo: FifoIpc,
+    ack_fifo: FifoIpc,
+}
+
+impl BenchGuard {
+    /// Connects to the existing FIFOs at `ctl_fifo` (commands are written to it)
+    /// and `ack_fifo` (acknowledgements are read from it), then signals the
+    /// start of a benchmark.
+    ///
+    /// # Errors
+    ///
+    /// Fails if either FIFO is missing or cannot be opened, or if a command
+    /// cannot be written.
+    pub fn new(ctl_fifo: &str, ack_fifo: &str) -> anyhow::Result<Self> {
+        let mut instance = Self {
+            ctl_fifo: FifoIpc::connect(ctl_fifo)?.with_writer()?,
+            ack_fifo: FifoIpc::connect(ack_fifo)?.with_reader()?,
+        };
+
+        instance.send_cmd(Command::SetIntegration {
+            name: "codspeed-rust".into(),
+            version: env!("CARGO_PKG_VERSION").into(),
+        })?; // FIXME: Just send it once
+        instance.send_cmd(Command::StartBenchmark)?;
+
+        Ok(instance)
+    }
+
+    /// Same as [`BenchGuard::new`], using the well-known runner FIFO paths.
+    pub fn new_with_runner_fifo() -> anyhow::Result<Self> {
+        Self::new(RUNNER_CTL_FIFO, RUNNER_ACK_FIFO)
+    }
+
+    /// Sends `cmd` and blocks until an acknowledgement has been received.
+    fn send_cmd(&mut self, cmd: Command) -> anyhow::Result<()> {
+        self.ctl_fifo.send_cmd(cmd)?;
+        self.ack_fifo.wait_for_ack();
+
+        Ok(())
+    }
+}
+
+impl Drop for BenchGuard {
+    fn drop(&mut self) {
+        // Report the failure instead of panicking: a panic raised while another
+        // panic is already unwinding aborts the whole process.
+        if let Err(error) = self.send_cmd(Command::StopBenchmark) {
+            eprintln!("Failed to send stop command: {}", error);
+        }
+    }
+}
+
+/// Sends a single command over the runner FIFOs and waits for its
+/// acknowledgement.
+///
+/// # Errors
+///
+/// Fails if a runner FIFO is missing or cannot be opened, or if the command
+/// cannot be written.
+pub fn send_cmd(cmd: Command) -> anyhow::Result<()> {
+    let mut writer = FifoIpc::connect(RUNNER_CTL_FIFO)?.with_writer()?;
+    // Propagate the failure rather than panicking; this function already
+    // returns a `Result`.
+    writer.send_cmd(cmd)?;
+
+    let mut reader = FifoIpc::connect(RUNNER_ACK_FIFO)?.with_reader()?;
+    reader.wait_for_ack();
+
+    Ok(())
+}
+
+/// One endpoint of a FIFO-based command channel.
+///
+/// A new value has no open file; call [`FifoIpc::with_reader`] and/or
+/// [`FifoIpc::with_writer`] to open the sides that are needed.
+#[derive(Debug)]
+pub struct FifoIpc {
+    path: PathBuf,
+    reader: Option<File>,
+    writer: Option<File>,
+}
+
+impl FifoIpc {
+    /// Creates a new FIFO at the specified path and connects to it.
+    ///
+    /// ```rust
+    /// use codspeed::fifo::{FifoIpc, Command};
+    ///
+    /// // Create the reader before the writer (required!):
+    /// let mut read_fifo = FifoIpc::create("/tmp/doctest.fifo").unwrap().with_reader().unwrap();
+    ///
+    /// // Connect to the FIFO and send a command
+    /// let mut fifo = FifoIpc::connect("/tmp/doctest.fifo").unwrap().with_writer().unwrap();
+    /// fifo.send_cmd(Command::StartBenchmark).unwrap();
+    ///
+    /// // Receive the command in the reader
+    /// let cmd = read_fifo.recv_cmd().unwrap();
+    /// assert_eq!(cmd, Command::StartBenchmark);
+    /// ```
+    pub fn create<P: AsRef<Path>>(path: P) -> anyhow::Result<Self> {
+        // Remove the previous FIFO (if it exists)
+        let _ = unlink(path.as_ref());
+
+        // Create the FIFO with RWX permissions for the owner
+        unistd::mkfifo(path.as_ref(), stat::Mode::S_IRWXU)?;
+
+        Self::connect(path.as_ref())
+    }
+
+    /// Connects to an already existing FIFO without opening it yet.
+    ///
+    /// # Errors
+    ///
+    /// Fails if nothing exists at `path`.
+    pub fn connect<P: Into<PathBuf>>(path: P) -> anyhow::Result<Self> {
+        let path = path.into();
+
+        if !path.exists() {
+            bail!("FIFO does not exist: {}", path.display());
+        }
+
+        Ok(Self {
+            path,
+            reader: None,
+            writer: None,
+        })
+    }
+
+    /// Opens the read side of the FIFO in non-blocking mode.
+    ///
+    /// The file is opened for reading *and* writing.
+    /// NOTE(review): presumably so that the open succeeds without a peer and
+    /// the reader never observes end-of-file -- confirm against fifo(7).
+    pub fn with_reader(mut self) -> anyhow::Result<Self> {
+        self.reader = Some(
+            OpenOptions::new()
+                .write(true)
+                .read(true)
+                .custom_flags(O_NONBLOCK)
+                .open(&self.path)?,
+        );
+        Ok(self)
+    }
+
+    /// Opens the write side of the FIFO in non-blocking mode.
+    ///
+    /// WARNING: Writer must be opened _AFTER_ the reader.
+    pub fn with_writer(mut self) -> anyhow::Result<Self> {
+        self.writer = Some(
+            OpenOptions::new()
+                .write(true)
+                .custom_flags(O_NONBLOCK)
+                .open(&self.path)?,
+        );
+        Ok(self)
+    }
+
+    /// Receives the next command from the FIFO.
+    ///
+    /// This does not wait for a message to show up: if none is pending, the
+    /// error of the non-blocking read is returned. Once the first byte of a
+    /// message has been read, the remainder of that message is waited for.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the reader is not open, if no message is pending, if reading
+    /// fails, or if the payload is not a valid [`Command`].
+    pub fn recv_cmd(&mut self) -> anyhow::Result<Command> {
+        // First read the length (u32 = 4 bytes)
+        let mut len_buffer = [0u8; 4];
+        self.read_frame_part(&mut len_buffer, false)?;
+        let message_len = u32::from_le_bytes(len_buffer) as usize;
+
+        // Then read the message; the length prefix announced it, so wait for it.
+        let mut buffer = vec![0u8; message_len];
+        self.read_frame_part(&mut buffer, true)?;
+
+        let decoded = bincode::deserialize(&buffer)?;
+        Ok(decoded)
+    }
+
+    /// Fills `buf` completely from the non-blocking reader.
+    ///
+    /// Bytes that were already received are kept when a read would block, so a
+    /// message arriving in several pieces is reassembled instead of being
+    /// read from the start again. Unless `wait_for_start` is set, a read that
+    /// would block before the first byte arrived is returned as an error.
+    fn read_frame_part(&mut self, buf: &mut [u8], wait_for_start: bool) -> std::io::Result<()> {
+        use std::io::ErrorKind;
+
+        let mut filled = 0;
+        while filled < buf.len() {
+            match self.read(&mut buf[filled..]) {
+                // End of file: the message can never be completed.
+                Ok(0) => return Err(ErrorKind::UnexpectedEof.into()),
+                Ok(count) => filled += count,
+                Err(error) if error.kind() == ErrorKind::Interrupted => {}
+                Err(error)
+                    if error.kind() == ErrorKind::WouldBlock && (wait_for_start || filled > 0) =>
+                {
+                    // The rest is still in flight: let the peer run, then retry.
+                    std::thread::yield_now();
+                }
+                Err(error) => return Err(error),
+            }
+        }
+        Ok(())
+    }
+
+    /// Sends `cmd` as one length-prefixed message.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the writer is not open, if the command cannot be encoded or is
+    /// too large for the `u32` length prefix, or if writing fails.
+    pub fn send_cmd(&mut self, cmd: Command) -> anyhow::Result<()> {
+        use std::io::Write;
+
+        let encoded = bincode::serialize(&cmd)?;
+        if encoded.len() > u32::MAX as usize {
+            bail!("Command is too large to send: {} bytes", encoded.len());
+        }
+
+        // Hand prefix and payload to the FIFO as one buffer instead of two
+        // separate writes, so the length is not sent on its own.
+        let mut frame = Vec::with_capacity(4 + encoded.len());
+        frame.extend_from_slice(&(encoded.len() as u32).to_le_bytes());
+        frame.extend_from_slice(&encoded);
+        self.write_all(&frame)?;
+        Ok(())
+    }
+
+    /// Blocks until a [`Command::Ack`] has been received.
+    ///
+    /// Receive errors (including "no message yet") and all other commands are
+    /// skipped, so this only returns once an `Ack` arrives.
+    pub fn wait_for_ack(&mut self) {
+        loop {
+            if let Ok(Command::Ack) = self.recv_cmd() {
+                break;
+            }
+            // Polling a non-blocking FIFO: give the peer a chance to run.
+            std::thread::yield_now();
+        }
+    }
+}
+
+impl std::io::Write for FifoIpc {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        if let Some(writer) = self.writer.as_mut() {
+            writer.write(buf)
+        } else {
+            Err(std::io::Error::new(
+                std::io::ErrorKind::NotConnected,
+                "Writer not initialized",
+            ))
+        }
+    }
+
+    fn flush(&mut self) -> std::io::Result<()> {
+        // Nothing is buffered in this type; forward to the file if it is open.
+        match self.writer.as_mut() {
+            Some(writer) => writer.flush(),
+            None => Ok(()),
+        }
+    }
+}
+
+impl std::io::Read for FifoIpc {
+    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
+        if let Some(reader) = self.reader.as_mut() {
+            reader.read(buf)
+        } else {
+            Err(std::io::Error::new(
+                std::io::ErrorKind::NotConnected,
+                "Reader not initialized",
+            ))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::Write;
+
+    #[test]
+    fn test_ipc_write_read() {
+        let mut fifo = FifoIpc::create("/tmp/test1.fifo")
+            .unwrap()
+            .with_reader()
+            .unwrap()
+            .with_writer()
+            .unwrap();
+
+        fifo.write_all(b"Hello").unwrap();
+        let mut buffer = [0; 5];
+        fifo.read_exact(&mut buffer).unwrap();
+        assert_eq!(&buffer, b"Hello");
+    }
+
+    #[test]
+    fn test_ipc_send_recv_cmd() {
+        let mut fifo = FifoIpc::create("/tmp/test2.fifo")
+            .unwrap()
+            .with_reader()
+            .unwrap()
+            .with_writer()
+            .unwrap();
+
+        fifo.send_cmd(Command::StartBenchmark).unwrap();
+        let cmd = fifo.recv_cmd().unwrap();
+        assert_eq!(cmd, Command::StartBenchmark);
+    }
+
+    #[test]
+    fn test_ipc_recv_without_message_fails() {
+        let mut fifo = FifoIpc::create("/tmp/test3.fifo")
+            .unwrap()
+            .with_reader()
+            .unwrap();
+
+        assert!(fifo.recv_cmd().is_err());
+    }
+}
diff --git a/crates/codspeed/src/lib.rs b/crates/codspeed/src/lib.rs
index bbac6486..87ad3453 100644
--- a/crates/codspeed/src/lib.rs
+++ b/crates/codspeed/src/lib.rs
@@ -1,6 +1,8 @@
 pub mod codspeed;
+pub mod fifo;
 mod macros;
 mod measurement;
 mod request;
+mod shared;
 pub mod utils;
 pub mod walltime;
diff --git a/crates/codspeed/src/shared.rs b/crates/codspeed/src/shared.rs
new file mode 100644
index 00000000..86c1433e
--- /dev/null
+++ b/crates/codspeed/src/shared.rs
@@ -0,0 +1,29 @@
+//! WARNING: Has to be in sync with `runner`.
+
+/// Path of the FIFO used to send commands to the runner.
+pub const RUNNER_CTL_FIFO: &str = "/tmp/runner.ctl.fifo";
+/// Path of the FIFO used to receive the runner's acknowledgements.
+pub const RUNNER_ACK_FIFO: &str = "/tmp/runner.ack.fifo";
+
+/// Messages exchanged with the runner over the FIFOs.
+///
+/// NOTE(review): the `bincode` encoding presumably identifies a variant by its
+/// position, so variants should not be reordered or removed without updating
+/// the runner as well -- confirm against the `bincode` documentation.
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum Command {
+    /// Announces the benchmark identified by `uri`, executed by process `pid`.
+    CurrentBenchmark { pid: u32, uri: String },
+    /// Marks the start of a benchmark.
+    StartBenchmark,
+    /// Marks the end of a benchmark.
+    StopBenchmark,
+    /// Acknowledges a previously sent command.
+    Ack,
+    /// NOTE(review): not sent by the code reviewed here; meaning is defined by the runner.
+    PingPerf,
+    /// Reports the `name` and `version` of the integration sending commands.
+    SetIntegration { name: String, version: String },
+    /// NOTE(review): not sent by the code reviewed here; presumably an error reply -- confirm.
+    Err,
+}

From 333eee4dd6331092e43835356557c228dc87aa67 Mon Sep 17 00:00:00 2001
From: not-matthias <matthias@codspeed.io>
Date: Tue, 22 Apr 2025 10:10:56 +0200
Subject: [PATCH 2/3] feat(divan_compat): add root frame and ipc with runner

---
 crates/divan_compat/divan_fork/src/bench/mod.rs   |  2 ++
 crates/divan_compat/divan_fork/src/divan.rs       |  7 +++++++
 crates/divan_compat/divan_fork/src/thread_pool.rs | 12 ++++++++++--
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/crates/divan_compat/divan_fork/src/bench/mod.rs b/crates/divan_compat/divan_fork/src/bench/mod.rs
index a8e730b8..0234babf 100644
--- a/crates/divan_compat/divan_fork/src/bench/mod.rs
+++ b/crates/divan_compat/divan_fork/src/bench/mod.rs
@@ -657,6 +657,7 @@ impl<'a> BenchContext<'a> {
 
         let bench_overheads = timer.bench_overheads();
 
+        let _guard = codspeed::fifo::BenchGuard::new_with_runner_fifo();
         while {
             // Conditions for when sampling is over:
             if elapsed_picos >= max_picos {
@@ -810,6 +811,7 @@ impl<'a> BenchContext<'a> {
                 elapsed_picos = elapsed_picos.saturating_add(progress_picos);
             }
         }
+        core::mem::drop(_guard);
 
         // Reset flag for ignoring allocations.
         crate::alloc::IGNORE_ALLOC.set(false);
diff --git a/crates/divan_compat/divan_fork/src/divan.rs b/crates/divan_compat/divan_fork/src/divan.rs
index 942a0e5c..3fe6688e 100644
--- a/crates/divan_compat/divan_fork/src/divan.rs
+++ b/crates/divan_compat/divan_fork/src/divan.rs
@@ -428,6 +428,13 @@ mod codspeed {
             bench_context.samples.time_samples.iter().map(|s| s.duration.picos / 1_000).collect();
         let max_time_ns = bench_context.options.max_time.map(|t| t.as_nanos());
 
+        if let Err(error) = ::codspeed::fifo::send_cmd(codspeed::fifo::Command::CurrentBenchmark {
+            pid: std::process::id(),
+            uri: uri.clone(),
+        }) {
+            eprintln!("Failed to send benchmark URI to runner: {}", error);
+        }
+
         ::codspeed::walltime::collect_raw_walltime_results(
             "divan",
             bench_name,
diff --git a/crates/divan_compat/divan_fork/src/thread_pool.rs b/crates/divan_compat/divan_fork/src/thread_pool.rs
index c607a936..4ec6118c 100644
--- a/crates/divan_compat/divan_fork/src/thread_pool.rs
+++ b/crates/divan_compat/divan_fork/src/thread_pool.rs
@@ -201,9 +201,17 @@ impl<F> TaskShared<F> {
         where
             F: Fn(usize),
         {
-            let task_fn = &(*task.cast::<TaskShared<F>>()).task_fn;
+            #[inline(never)]
+            unsafe fn __codspeed_root_frame__<F>(task: *const TaskShared<()>, thread: usize)
+            where
+                F: Fn(usize),
+            {
+                let task_fn = &(*task.cast::<TaskShared<F>>()).task_fn;
+
+                task_fn(thread);
+            }
 
-            task_fn(thread);
+            __codspeed_root_frame__::<F>(task, thread);
         }
 
         Self {

From 91145111f25b47462b6504962d3249c953fcc718 Mon Sep 17 00:00:00 2001
From: not-matthias <matthias@codspeed.io>
Date: Tue, 22 Apr 2025 10:12:45 +0200
Subject: [PATCH 3/3] feat(criterion_compat): add root frame and ipc with
 runner

---
 .../criterion_fork/src/analysis/mod.rs        |    7 +
 .../criterion_fork/src/bencher.rs             | 1626 +++++++++--------
 .../criterion_fork/src/routine.rs             |    6 +-
 3 files changed, 873 insertions(+), 766 deletions(-)

diff --git a/crates/criterion_compat/criterion_fork/src/analysis/mod.rs b/crates/criterion_compat/criterion_fork/src/analysis/mod.rs
index bddd90c1..d2431403 100644
--- a/crates/criterion_compat/criterion_fork/src/analysis/mod.rs
+++ b/crates/criterion_compat/criterion_fork/src/analysis/mod.rs
@@ -297,6 +297,13 @@ mod codspeed {
     ) {
         let (uri, bench_name) = create_uri_and_name(id, c);
 
+        if let Err(error) = ::codspeed::fifo::send_cmd(codspeed::fifo::Command::CurrentBenchmark {
+            pid: std::process::id(),
+            uri: uri.clone(),
+        }) {
+            eprintln!("Failed to send benchmark URI to runner: {}", error);
+        }
+
         let avg_iter_per_round = iters.iter().sum::<f64>() / iters.len() as f64;
         let max_time_ns = Some(c.config.measurement_time.as_nanos());
         let times_ns = avg_times.iter().map(|t| *t as u128).collect();
diff --git a/crates/criterion_compat/criterion_fork/src/bencher.rs b/crates/criterion_compat/criterion_fork/src/bencher.rs
index 016aa284..198b5ccb 100644
--- a/crates/criterion_compat/criterion_fork/src/bencher.rs
+++ b/crates/criterion_compat/criterion_fork/src/bencher.rs
@@ -1,764 +1,862 @@
-use std::iter::IntoIterator;
-use std::time::Duration;
-use std::time::Instant;
-
-use crate::black_box;
-use crate::measurement::{Measurement, WallTime};
-use crate::BatchSize;
-
-#[cfg(feature = "async")]
-use std::future::Future;
-
-#[cfg(feature = "async")]
-use crate::async_executor::AsyncExecutor;
-
-// ================================== MAINTENANCE NOTE =============================================
-// Any changes made to either Bencher or AsyncBencher will have to be replicated to the other!
-// ================================== MAINTENANCE NOTE =============================================
-
-/// Timer struct used to iterate a benchmarked function and measure the runtime.
-///
-/// This struct provides different timing loops as methods. Each timing loop provides a different
-/// way to time a routine and each has advantages and disadvantages.
-///
-/// * If you want to do the iteration and measurement yourself (eg. passing the iteration count
-///   to a separate process), use `iter_custom`.
-/// * If your routine requires no per-iteration setup and returns a value with an expensive `drop`
-///   method, use `iter_with_large_drop`.
-/// * If your routine requires some per-iteration setup that shouldn't be timed, use `iter_batched`
-///   or `iter_batched_ref`. See [`BatchSize`](enum.BatchSize.html) for a discussion of batch sizes.
-///   If the setup value implements `Drop` and you don't want to include the `drop` time in the
-///   measurement, use `iter_batched_ref`, otherwise use `iter_batched`. These methods are also
-///   suitable for benchmarking routines which return a value with an expensive `drop` method,
-///   but are more complex than `iter_with_large_drop`.
-/// * Otherwise, use `iter`.
-pub struct Bencher<'a, M: Measurement = WallTime> {
-    pub(crate) iterated: bool,         // Have we iterated this benchmark?
-    pub(crate) iters: u64,             // Number of times to iterate this benchmark
-    pub(crate) value: M::Value,        // The measured value
-    pub(crate) measurement: &'a M,     // Reference to the measurement object
-    pub(crate) elapsed_time: Duration, // How much time did it take to perform the iteration? Used for the warmup period.
-}
-impl<'a, M: Measurement> Bencher<'a, M> {
-    /// Times a `routine` by executing it many times and timing the total elapsed time.
-    ///
-    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
-    ///
-    /// # Timing model
-    ///
-    /// Note that the `Bencher` also times the time required to destroy the output of `routine()`.
-    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
-    /// to the runtime of the `routine`.
-    ///
-    /// ```text
-    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// // The function to benchmark
-    /// fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.iter(|| foo())
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter<O, R>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> O,
-    {
-        self.iterated = true;
-        let time_start = Instant::now();
-        let start = self.measurement.start();
-        for _ in 0..self.iters {
-            black_box(routine());
-        }
-        self.value = self.measurement.end(start);
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
-    ///
-    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
-    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
-    /// and coordinate with multiple threads).
-    ///
-    /// # Timing model
-    /// Custom, the timing model is whatever is returned as the Duration from `routine`.
-    ///
-    /// # Example
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    /// use criterion::*;
-    /// use criterion::black_box;
-    /// use std::time::Instant;
-    ///
-    /// fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.iter_custom(|iters| {
-    ///             let start = Instant::now();
-    ///             for _i in 0..iters {
-    ///                 black_box(foo());
-    ///             }
-    ///             start.elapsed()
-    ///         })
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_custom<R>(&mut self, mut routine: R)
-    where
-        R: FnMut(u64) -> M::Value,
-    {
-        self.iterated = true;
-        let time_start = Instant::now();
-        self.value = routine(self.iters);
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    #[doc(hidden)]
-    pub fn iter_with_setup<I, O, S, R>(&mut self, setup: S, routine: R)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> O,
-    {
-        self.iter_batched(setup, routine, BatchSize::PerIteration);
-    }
-
-    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
-    /// destructor of the value returned by `routine`.
-    ///
-    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
-    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
-    ///
-    /// # Timing model
-    ///
-    /// ``` text
-    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// fn create_vector() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("with_drop", move |b| {
-    ///         // This will avoid timing the Vec::drop.
-    ///         b.iter_with_large_drop(|| create_vector())
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    pub fn iter_with_large_drop<O, R>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> O,
-    {
-        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput);
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine must consume its input.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the to_vec call.
-    ///         b.iter_batched(|| data.clone(), |mut data| sort(&mut data), BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> O,
-    {
-        self.iterated = true;
-        let batch_size = size.iters_per_batch(self.iters);
-        assert!(batch_size != 0, "Batch size must not be zero.");
-        let time_start = Instant::now();
-        self.value = self.measurement.zero();
-
-        if batch_size == 1 {
-            for _ in 0..self.iters {
-                let input = black_box(setup());
-
-                let start = self.measurement.start();
-                let output = routine(input);
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                drop(black_box(output));
-            }
-        } else {
-            let mut iteration_counter = 0;
-
-            while iteration_counter < self.iters {
-                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
-
-                let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                let start = self.measurement.start();
-                outputs.extend(inputs.into_iter().map(&mut routine));
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                black_box(outputs);
-
-                iteration_counter += batch_size;
-            }
-        }
-
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine should accept the input by
-    /// mutable reference.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the to_vec call.
-    ///         b.iter_batched(|| data.clone(), |mut data| sort(&mut data), BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched_ref<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(&mut I) -> O,
-    {
-        self.iterated = true;
-        let batch_size = size.iters_per_batch(self.iters);
-        assert!(batch_size != 0, "Batch size must not be zero.");
-        let time_start = Instant::now();
-        self.value = self.measurement.zero();
-
-        if batch_size == 1 {
-            for _ in 0..self.iters {
-                let mut input = black_box(setup());
-
-                let start = self.measurement.start();
-                let output = routine(&mut input);
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                drop(black_box(output));
-                drop(black_box(input));
-            }
-        } else {
-            let mut iteration_counter = 0;
-
-            while iteration_counter < self.iters {
-                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
-
-                let mut inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                let start = self.measurement.start();
-                outputs.extend(inputs.iter_mut().map(&mut routine));
-                let end = self.measurement.end(start);
-                self.value = self.measurement.add(&self.value, &end);
-
-                black_box(outputs);
-
-                iteration_counter += batch_size;
-            }
-        }
-        self.elapsed_time = time_start.elapsed();
-    }
-
-    // Benchmarks must actually call one of the iter methods. This causes benchmarks to fail loudly
-    // if they don't.
-    pub(crate) fn assert_iterated(&mut self) {
-        assert!(
-            self.iterated,
-            "Benchmark function must call Bencher::iter or related method."
-        );
-        self.iterated = false;
-    }
-
-    /// Convert this bencher into an AsyncBencher, which enables async/await support.
-    #[cfg(feature = "async")]
-    pub fn to_async<'b, A: AsyncExecutor>(&'b mut self, runner: A) -> AsyncBencher<'a, 'b, A, M> {
-        AsyncBencher { b: self, runner }
-    }
-}
-
-/// Async/await variant of the Bencher struct.
-#[cfg(feature = "async")]
-pub struct AsyncBencher<'a, 'b, A: AsyncExecutor, M: Measurement = WallTime> {
-    b: &'b mut Bencher<'a, M>,
-    runner: A,
-}
-#[cfg(feature = "async")]
-impl<'a, 'b, A: AsyncExecutor, M: Measurement> AsyncBencher<'a, 'b, A, M> {
-    /// Times a `routine` by executing it many times and timing the total elapsed time.
-    ///
-    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
-    ///
-    /// # Timing model
-    ///
-    /// Note that the `AsyncBencher` also times the time required to destroy the output of `routine()`.
-    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
-    /// to the runtime of the `routine`.
-    ///
-    /// ```text
-    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// // The function to benchmark
-    /// async fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.to_async(FuturesExecutor).iter(|| async { foo().await } )
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter<O, R, F>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> F,
-        F: Future<Output = O>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let time_start = Instant::now();
-            let start = b.measurement.start();
-            for _ in 0..b.iters {
-                black_box(routine().await);
-            }
-            b.value = b.measurement.end(start);
-            b.elapsed_time = time_start.elapsed();
-        });
-    }
-
-    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
-    ///
-    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
-    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
-    /// and coordinate with multiple threads).
-    ///
-    /// # Timing model
-    /// Custom, the timing model is whatever is returned as the Duration from `routine`.
-    ///
-    /// # Example
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    /// use criterion::*;
-    /// use criterion::black_box;
-    /// use criterion::async_executor::FuturesExecutor;
-    /// use std::time::Instant;
-    ///
-    /// async fn foo() {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("iter", move |b| {
-    ///         b.to_async(FuturesExecutor).iter_custom(|iters| {
-    ///             async move {
-    ///                 let start = Instant::now();
-    ///                 for _i in 0..iters {
-    ///                     black_box(foo().await);
-    ///                 }
-    ///                 start.elapsed()
-    ///             }
-    ///         })
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_custom<R, F>(&mut self, mut routine: R)
-    where
-        R: FnMut(u64) -> F,
-        F: Future<Output = M::Value>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let time_start = Instant::now();
-            b.value = routine(b.iters).await;
-            b.elapsed_time = time_start.elapsed();
-        })
-    }
-
-    #[doc(hidden)]
-    pub fn iter_with_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> F,
-        F: Future<Output = O>,
-    {
-        self.iter_batched(setup, routine, BatchSize::PerIteration);
-    }
-
-    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
-    /// destructor of the value returned by `routine`.
-    ///
-    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
-    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
-    ///
-    /// # Timing model
-    ///
-    /// ``` text
-    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// async fn create_vector() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     c.bench_function("with_drop", move |b| {
-    ///         // This will avoid timing the Vec::drop.
-    ///         b.to_async(FuturesExecutor).iter_with_large_drop(|| async { create_vector().await })
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    pub fn iter_with_large_drop<O, R, F>(&mut self, mut routine: R)
-    where
-        R: FnMut() -> F,
-        F: Future<Output = O>,
-    {
-        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput);
-    }
-
-    #[doc(hidden)]
-    pub fn iter_with_large_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> F,
-        F: Future<Output = O>,
-    {
-        self.iter_batched(setup, routine, BatchSize::NumBatches(1));
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine must consume its input.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// async fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the to_vec call.
-    ///         b.iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched<I, O, S, R, F>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(I) -> F,
-        F: Future<Output = O>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let batch_size = size.iters_per_batch(b.iters);
-            assert!(batch_size != 0, "Batch size must not be zero.");
-            let time_start = Instant::now();
-            b.value = b.measurement.zero();
-
-            if batch_size == 1 {
-                for _ in 0..b.iters {
-                    let input = black_box(setup());
-
-                    let start = b.measurement.start();
-                    let output = routine(input).await;
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    drop(black_box(output));
-                }
-            } else {
-                let mut iteration_counter = 0;
-
-                while iteration_counter < b.iters {
-                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
-
-                    let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                    let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                    let start = b.measurement.start();
-                    // Can't use .extend here like the sync version does
-                    for input in inputs {
-                        outputs.push(routine(input).await);
-                    }
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    black_box(outputs);
-
-                    iteration_counter += batch_size;
-                }
-            }
-
-            b.elapsed_time = time_start.elapsed();
-        })
-    }
-
-    /// Times a `routine` that requires some input by generating a batch of input, then timing the
-    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
-    /// details on choosing the batch size. Use this when the routine should accept the input by
-    /// mutable reference.
-    ///
-    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
-    /// data on each iteration.
-    ///
-    /// # Timing model
-    ///
-    /// ```text
-    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
-    /// ```
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// #[macro_use] extern crate criterion;
-    ///
-    /// use criterion::*;
-    /// use criterion::async_executor::FuturesExecutor;
-    ///
-    /// fn create_scrambled_data() -> Vec<u64> {
-    ///     # vec![]
-    ///     // ...
-    /// }
-    ///
-    /// // The sorting algorithm to test
-    /// async fn sort(data: &mut [u64]) {
-    ///     // ...
-    /// }
-    ///
-    /// fn bench(c: &mut Criterion) {
-    ///     let data = create_scrambled_data();
-    ///
-    ///     c.bench_function("with_setup", move |b| {
-    ///         // This will avoid timing the to_vec call.
-    ///         b.iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
-    ///     });
-    /// }
-    ///
-    /// criterion_group!(benches, bench);
-    /// criterion_main!(benches);
-    /// ```
-    ///
-    #[inline(never)]
-    pub fn iter_batched_ref<I, O, S, R, F>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
-    where
-        S: FnMut() -> I,
-        R: FnMut(&mut I) -> F,
-        F: Future<Output = O>,
-    {
-        let AsyncBencher { b, runner } = self;
-        runner.block_on(async {
-            b.iterated = true;
-            let batch_size = size.iters_per_batch(b.iters);
-            assert!(batch_size != 0, "Batch size must not be zero.");
-            let time_start = Instant::now();
-            b.value = b.measurement.zero();
-
-            if batch_size == 1 {
-                for _ in 0..b.iters {
-                    let mut input = black_box(setup());
-
-                    let start = b.measurement.start();
-                    let output = routine(&mut input).await;
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    drop(black_box(output));
-                    drop(black_box(input));
-                }
-            } else {
-                let mut iteration_counter = 0;
-
-                while iteration_counter < b.iters {
-                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
-
-                    let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
-                    let mut outputs = Vec::with_capacity(batch_size as usize);
-
-                    let start = b.measurement.start();
-                    // Can't use .extend here like the sync version does
-                    for mut input in inputs {
-                        outputs.push(routine(&mut input).await);
-                    }
-                    let end = b.measurement.end(start);
-                    b.value = b.measurement.add(&b.value, &end);
-
-                    black_box(outputs);
-
-                    iteration_counter += batch_size;
-                }
-            }
-            b.elapsed_time = time_start.elapsed();
-        });
-    }
-}
+#![allow(unused_mut)]
+
+use std::iter::IntoIterator;
+use std::time::Duration;
+use std::time::Instant;
+
+use crate::black_box;
+use crate::measurement::{Measurement, WallTime};
+use crate::BatchSize;
+
+#[cfg(feature = "async")]
+use std::future::Future;
+
+#[cfg(feature = "async")]
+use crate::async_executor::AsyncExecutor;
+
+// ================================== MAINTENANCE NOTE =============================================
+// Any changes made to either Bencher or AsyncBencher will have to be replicated to the other!
+// ================================== MAINTENANCE NOTE =============================================
+
+/// Timer struct used to iterate a benchmarked function and measure the runtime.
+///
+/// This struct provides different timing loops as methods. Each timing loop provides a different
+/// way to time a routine and each has advantages and disadvantages.
+///
+/// * If you want to do the iteration and measurement yourself (e.g. passing the iteration count
+///   to a separate process), use `iter_custom`.
+/// * If your routine requires no per-iteration setup and returns a value with an expensive `drop`
+///   method, use `iter_with_large_drop`.
+/// * If your routine requires some per-iteration setup that shouldn't be timed, use `iter_batched`
+///   or `iter_batched_ref`. See [`BatchSize`](enum.BatchSize.html) for a discussion of batch sizes.
+///   If the setup value implements `Drop` and you don't want to include the `drop` time in the
+///   measurement, use `iter_batched_ref`, otherwise use `iter_batched`. These methods are also
+///   suitable for benchmarking routines which return a value with an expensive `drop` method,
+///   but are more complex than `iter_with_large_drop`.
+/// * Otherwise, use `iter`.
+pub struct Bencher<'a, M: Measurement = WallTime> {
+    pub(crate) iterated: bool,         // Have we iterated this benchmark?
+    pub(crate) iters: u64,             // Number of times to iterate this benchmark
+    pub(crate) value: M::Value,        // The measured value
+    pub(crate) measurement: &'a M,     // Reference to the measurement object
+    pub(crate) elapsed_time: Duration, // How much time did it take to perform the iteration? Used for the warmup period.
+}
+impl<'a, M: Measurement> Bencher<'a, M> {
+    /// Times a `routine` by executing it many times and timing the total elapsed time.
+    ///
+    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
+    ///
+    /// # Timing model
+    ///
+    /// Note that the `Bencher` also times the time required to destroy the output of `routine()`.
+    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
+    /// to the runtime of the `routine`.
+    ///
+    /// ```text
+    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// // The function to benchmark
+    /// fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.iter(|| foo())
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter<O, R>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> O,
+    {
+        self.__codspeed_root_frame__iter(routine)
+    }
+
+    #[inline(never)]
+    #[allow(non_snake_case, missing_docs)]
+    pub fn __codspeed_root_frame__iter<O, R>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> O,
+    {
+        self.iterated = true;
+        let time_start = Instant::now();
+        let start = self.measurement.start();
+        for _ in 0..self.iters {
+            black_box(routine());
+        }
+        self.value = self.measurement.end(start);
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
+    ///
+    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
+    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
+    /// and coordinate with multiple threads).
+    ///
+    /// # Timing model
+    /// Custom: the measured value is whatever `routine` returns (`M::Value`; a `Duration` in the example below).
+    ///
+    /// # Example
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    /// use criterion::*;
+    /// use criterion::black_box;
+    /// use std::time::Instant;
+    ///
+    /// fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.iter_custom(|iters| {
+    ///             let start = Instant::now();
+    ///             for _i in 0..iters {
+    ///                 black_box(foo());
+    ///             }
+    ///             start.elapsed()
+    ///         })
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_custom<R>(&mut self, mut routine: R)
+    where
+        R: FnMut(u64) -> M::Value,
+    {
+        self.__codspeed_root_frame__iter_custom(routine)
+    }
+
+    #[inline(never)]
+    #[allow(missing_docs, non_snake_case)]
+    pub fn __codspeed_root_frame__iter_custom<R>(&mut self, mut routine: R)
+    where
+        R: FnMut(u64) -> M::Value,
+    {
+        self.iterated = true;
+        let time_start = Instant::now();
+        self.value = routine(self.iters);
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    #[doc(hidden)]
+    pub fn iter_with_setup<I, O, S, R>(&mut self, setup: S, routine: R)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> O,
+    {
+        self.iter_batched(setup, routine, BatchSize::PerIteration);
+    }
+
+    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
+    /// destructor of the value returned by `routine`.
+    ///
+    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
+    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// fn create_vector() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("with_drop", move |b| {
+    ///         // This will avoid timing the Vec::drop.
+    ///         b.iter_with_large_drop(|| create_vector())
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    pub fn iter_with_large_drop<O, R>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> O,
+    {
+        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput);
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine must consume its input.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the `data.clone()` call.
+    ///         b.iter_batched(|| data.clone(), |mut data| sort(&mut data), BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> O,
+    {
+        self.__codspeed_root_frame__iter_batched(setup, routine, size);
+    }
+
+    #[inline(never)]
+    #[allow(missing_docs, non_snake_case)]
+    pub fn __codspeed_root_frame__iter_batched<I, O, S, R>(
+        &mut self,
+        mut setup: S,
+        mut routine: R,
+        size: BatchSize,
+    ) where
+        S: FnMut() -> I,
+        R: FnMut(I) -> O,
+    {
+        self.iterated = true;
+        let batch_size = size.iters_per_batch(self.iters);
+        assert!(batch_size != 0, "Batch size must not be zero.");
+        let time_start = Instant::now();
+        self.value = self.measurement.zero();
+
+        if batch_size == 1 {
+            for _ in 0..self.iters {
+                let input = black_box(setup());
+
+                let start = self.measurement.start();
+                let output = routine(input);
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                drop(black_box(output));
+            }
+        } else {
+            let mut iteration_counter = 0;
+
+            while iteration_counter < self.iters {
+                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
+
+                let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
+                let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                let start = self.measurement.start();
+                outputs.extend(inputs.into_iter().map(&mut routine));
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                black_box(outputs);
+
+                iteration_counter += batch_size;
+            }
+        }
+
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine should accept the input by
+    /// mutable reference.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the `data.clone()` call and the drop of the input.
+    ///         b.iter_batched_ref(|| data.clone(), |data| sort(data), BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched_ref<I, O, S, R>(&mut self, mut setup: S, mut routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(&mut I) -> O,
+    {
+        self.__codspeed_root_frame__iter_batched_ref(setup, routine, size)
+    }
+
+    #[inline(never)]
+    #[allow(missing_docs, non_snake_case)]
+    pub fn __codspeed_root_frame__iter_batched_ref<I, O, S, R>(
+        &mut self,
+        mut setup: S,
+        mut routine: R,
+        size: BatchSize,
+    ) where
+        S: FnMut() -> I,
+        R: FnMut(&mut I) -> O,
+    {
+        self.iterated = true;
+        let batch_size = size.iters_per_batch(self.iters);
+        assert!(batch_size != 0, "Batch size must not be zero.");
+        let time_start = Instant::now();
+        self.value = self.measurement.zero();
+
+        if batch_size == 1 {
+            for _ in 0..self.iters {
+                let mut input = black_box(setup());
+
+                let start = self.measurement.start();
+                let output = routine(&mut input);
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                drop(black_box(output));
+                drop(black_box(input));
+            }
+        } else {
+            let mut iteration_counter = 0;
+
+            while iteration_counter < self.iters {
+                let batch_size = ::std::cmp::min(batch_size, self.iters - iteration_counter);
+
+                let mut inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
+                let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                let start = self.measurement.start();
+                outputs.extend(inputs.iter_mut().map(&mut routine));
+                let end = self.measurement.end(start);
+                self.value = self.measurement.add(&self.value, &end);
+
+                black_box(outputs);
+
+                iteration_counter += batch_size;
+            }
+        }
+        self.elapsed_time = time_start.elapsed();
+    }
+
+    // Benchmarks must actually call one of the iter methods. This causes benchmarks to fail loudly
+    // if they don't.
+    pub(crate) fn assert_iterated(&mut self) {
+        assert!(
+            self.iterated,
+            "Benchmark function must call Bencher::iter or related method."
+        );
+        self.iterated = false;
+    }
+
+    /// Convert this bencher into an AsyncBencher, which enables async/await support.
+    #[cfg(feature = "async")]
+    pub fn to_async<'b, A: AsyncExecutor>(&'b mut self, runner: A) -> AsyncBencher<'a, 'b, A, M> {
+        AsyncBencher { b: self, runner }
+    }
+}
+
+/// Async/await variant of the Bencher struct.
+#[cfg(feature = "async")]
+pub struct AsyncBencher<'a, 'b, A: AsyncExecutor, M: Measurement = WallTime> {
+    b: &'b mut Bencher<'a, M>,
+    runner: A,
+}
+#[cfg(feature = "async")]
+impl<'a, 'b, A: AsyncExecutor, M: Measurement> AsyncBencher<'a, 'b, A, M> {
+    /// Times a `routine` by executing it many times and timing the total elapsed time.
+    ///
+    /// Prefer this timing loop when `routine` returns a value that doesn't have a destructor.
+    ///
+    /// # Timing model
+    ///
+    /// Note that the `AsyncBencher` also times the time required to destroy the output of `routine()`.
+    /// Therefore prefer this timing loop when the runtime of `mem::drop(O)` is negligible compared
+    /// to the runtime of the `routine`.
+    ///
+    /// ```text
+    /// elapsed = Instant::now + iters * (routine + mem::drop(O) + Range::next)
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// // The function to benchmark
+    /// async fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.to_async(FuturesExecutor).iter(|| async { foo().await } )
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter<O, R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> F,
+        F: Future<Output = O>,
+    {
+        self.__codspeed_root_frame__iter(routine)
+    }
+
+    #[inline(never)]
+    #[allow(non_snake_case, missing_docs)]
+    pub fn __codspeed_root_frame__iter<O, R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> F,
+        F: Future<Output = O>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let time_start = Instant::now();
+            let start = b.measurement.start();
+            for _ in 0..b.iters {
+                black_box(routine().await);
+            }
+            b.value = b.measurement.end(start);
+            b.elapsed_time = time_start.elapsed();
+        });
+    }
+
+    /// Times a `routine` by executing it many times and relying on `routine` to measure its own execution time.
+    ///
+    /// Prefer this timing loop in cases where `routine` has to do its own measurements to
+    /// get accurate timing information (for example in multi-threaded scenarios where you spawn
+    /// and coordinate with multiple threads).
+    ///
+    /// # Timing model
+    /// Custom: the measured value is whatever `routine` returns (`M::Value`; a `Duration` in the example below).
+    ///
+    /// # Example
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    /// use criterion::*;
+    /// use criterion::black_box;
+    /// use criterion::async_executor::FuturesExecutor;
+    /// use std::time::Instant;
+    ///
+    /// async fn foo() {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("iter", move |b| {
+    ///         b.to_async(FuturesExecutor).iter_custom(|iters| {
+    ///             async move {
+    ///                 let start = Instant::now();
+    ///                 for _i in 0..iters {
+    ///                     black_box(foo().await);
+    ///                 }
+    ///                 start.elapsed()
+    ///             }
+    ///         })
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_custom<R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut(u64) -> F,
+        F: Future<Output = M::Value>,
+    {
+        self.__codspeed_root_frame__iter_custom(routine)
+    }
+
+    #[inline(never)] // NOTE(review): presumably kept as its own frame for profiling; confirm
+    #[allow(non_snake_case, missing_docs)]
+    pub fn __codspeed_root_frame__iter_custom<R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut(u64) -> F,
+        F: Future<Output = M::Value>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let time_start = Instant::now();
+            b.value = routine(b.iters).await; // `routine` returns the measured value itself
+            b.elapsed_time = time_start.elapsed(); // wall clock of the whole call
+        })
+    }
+
+    #[doc(hidden)] // thin alias: `iter_batched` with `BatchSize::PerIteration`
+    pub fn iter_with_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> F,
+        F: Future<Output = O>,
+    {
+        self.iter_batched(setup, routine, BatchSize::PerIteration);
+    }
+
+    /// Times a `routine` by collecting its output on each iteration. This avoids timing the
+    /// destructor of the value returned by `routine`.
+    ///
+    /// WARNING: This requires `O(iters * mem::size_of::<O>())` of memory, and `iters` is not under the
+    /// control of the caller. If this causes out-of-memory errors, use `iter_batched` instead.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = Instant::now + iters * (routine) + Iterator::collect::<Vec<_>>
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// async fn create_vector() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     c.bench_function("with_drop", move |b| {
+    ///         // This will avoid timing the Vec::drop.
+    ///         b.to_async(FuturesExecutor).iter_with_large_drop(|| async { create_vector().await })
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    pub fn iter_with_large_drop<O, R, F>(&mut self, mut routine: R)
+    where
+        R: FnMut() -> F,
+        F: Future<Output = O>,
+    {
+        self.iter_batched(|| (), |_| routine(), BatchSize::SmallInput); // unit setup: no input
+    }
+
+    #[doc(hidden)] // thin alias: `iter_batched` with `BatchSize::NumBatches(1)`
+    pub fn iter_with_large_setup<I, O, S, R, F>(&mut self, setup: S, routine: R)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> F,
+        F: Future<Output = O>,
+    {
+        self.iter_batched(setup, routine, BatchSize::NumBatches(1));
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine must consume its input.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * (routine + O::drop)) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// async fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the `data.clone()` call.
+    ///         b.to_async(FuturesExecutor).iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched<I, O, S, R, F>(&mut self, setup: S, routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(I) -> F,
+        F: Future<Output = O>,
+    {
+        self.__codspeed_root_frame__iter_batched(setup, routine, size); // callee does the timing
+    }
+
+    #[inline(never)] // NOTE(review): presumably kept as its own frame for profiling; confirm
+    #[allow(non_snake_case, missing_docs)]
+    pub fn __codspeed_root_frame__iter_batched<I, O, S, R, F>(
+        &mut self,
+        mut setup: S,
+        mut routine: R,
+        size: BatchSize,
+    ) where
+        S: FnMut() -> I,
+        R: FnMut(I) -> F,
+        F: Future<Output = O>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let batch_size = size.iters_per_batch(b.iters);
+            assert!(batch_size != 0, "Batch size must not be zero.");
+            let time_start = Instant::now();
+            b.value = b.measurement.zero(); // accumulator: every measured span is added to it
+
+            if batch_size == 1 {
+                for _ in 0..b.iters {
+                    let input = black_box(setup()); // setup runs before the measured span
+
+                    let start = b.measurement.start();
+                    let output = routine(input).await;
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    drop(black_box(output)); // dropped after the span has ended
+                }
+            } else {
+                let mut iteration_counter = 0; // iterations completed so far
+
+                while iteration_counter < b.iters {
+                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
+
+                    let inputs = black_box((0..batch_size).map(|_| setup()).collect::<Vec<_>>());
+                    let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                    let start = b.measurement.start();
+                    // Can't use .extend here like the sync version does
+                    for input in inputs {
+                        outputs.push(routine(input).await);
+                    }
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    black_box(outputs); // outputs are dropped after the span has ended
+
+                    iteration_counter += batch_size;
+                }
+            }
+
+            b.elapsed_time = time_start.elapsed(); // wall clock of the whole call, setup included
+        })
+    }
+
+    /// Times a `routine` that requires some input by generating a batch of input, then timing the
+    /// iteration of the benchmark over the input. See [`BatchSize`](enum.BatchSize.html) for
+    /// details on choosing the batch size. Use this when the routine should accept the input by
+    /// mutable reference.
+    ///
+    /// For example, use this loop to benchmark sorting algorithms, because they require unsorted
+    /// data on each iteration.
+    ///
+    /// # Timing model
+    ///
+    /// ```text
+    /// elapsed = (Instant::now * num_batches) + (iters * routine) + Vec::extend
+    /// ```
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// #[macro_use] extern crate criterion;
+    ///
+    /// use criterion::*;
+    /// use criterion::async_executor::FuturesExecutor;
+    ///
+    /// fn create_scrambled_data() -> Vec<u64> {
+    ///     # vec![]
+    ///     // ...
+    /// }
+    ///
+    /// // The sorting algorithm to test
+    /// async fn sort(data: &mut [u64]) {
+    ///     // ...
+    /// }
+    ///
+    /// fn bench(c: &mut Criterion) {
+    ///     let data = create_scrambled_data();
+    ///
+    ///     c.bench_function("with_setup", move |b| {
+    ///         // This will avoid timing the `data.clone()` call.
+    ///         b.to_async(FuturesExecutor).iter_batched(|| data.clone(), |mut data| async move { sort(&mut data).await }, BatchSize::SmallInput)
+    ///     });
+    /// }
+    ///
+    /// criterion_group!(benches, bench);
+    /// criterion_main!(benches);
+    /// ```
+    ///
+    #[inline(never)]
+    pub fn iter_batched_ref<I, O, S, R, F>(&mut self, setup: S, routine: R, size: BatchSize)
+    where
+        S: FnMut() -> I,
+        R: FnMut(&mut I) -> F,
+        F: Future<Output = O>,
+    {
+        self.__codspeed_root_frame__iter_batched_ref(setup, routine, size) // callee does the timing
+    }
+
+    #[inline(never)] // NOTE(review): presumably kept as its own frame for profiling; confirm
+    #[allow(non_snake_case, missing_docs)]
+    pub fn __codspeed_root_frame__iter_batched_ref<I, O, S, R, F>(
+        &mut self,
+        mut setup: S,
+        mut routine: R,
+        size: BatchSize,
+    ) where
+        S: FnMut() -> I,
+        R: FnMut(&mut I) -> F,
+        F: Future<Output = O>,
+    {
+        let AsyncBencher { b, runner } = self;
+        runner.block_on(async {
+            b.iterated = true;
+            let batch_size = size.iters_per_batch(b.iters);
+            assert!(batch_size != 0, "Batch size must not be zero.");
+            let time_start = Instant::now();
+            b.value = b.measurement.zero(); // accumulator: every measured span is added to it
+
+            if batch_size == 1 {
+                for _ in 0..b.iters {
+                    let mut input = black_box(setup()); // setup runs before the measured span
+
+                    let start = b.measurement.start();
+                    let output = routine(&mut input).await;
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    drop(black_box(output)); // output and input are dropped after the span
+                    drop(black_box(input));
+                }
+            } else {
+                let mut iteration_counter = 0; // iterations completed so far
+
+                while iteration_counter < b.iters {
+                    let batch_size = ::std::cmp::min(batch_size, b.iters - iteration_counter);
+
+                    let mut inputs: Vec<I> = black_box((0..batch_size).map(|_| setup()).collect());
+                    let mut outputs = Vec::with_capacity(batch_size as usize);
+
+                    let start = b.measurement.start();
+                    // Can't use .extend like the sync version; borrow so input drops stay untimed
+                    for input in inputs.iter_mut() {
+                        outputs.push(routine(input).await);
+                    }
+                    let end = b.measurement.end(start);
+                    b.value = b.measurement.add(&b.value, &end);
+
+                    black_box(outputs); // `inputs` is dropped at the end of this loop body
+
+                    iteration_counter += batch_size;
+                }
+            }
+            b.elapsed_time = time_start.elapsed(); // wall clock of the whole call, setup included
+        });
+    }
+}
diff --git a/crates/criterion_compat/criterion_fork/src/routine.rs b/crates/criterion_compat/criterion_fork/src/routine.rs
index 88e4318b..f418b44f 100644
--- a/crates/criterion_compat/criterion_fork/src/routine.rs
+++ b/crates/criterion_compat/criterion_fork/src/routine.rs
@@ -191,8 +191,10 @@ pub(crate) trait Routine<M: Measurement, T: ?Sized> {
             .unwrap();
         }
 
-        let m_elapsed = self.bench(measurement, &m_iters, parameter);
-
+        let m_elapsed = {
+            let _guard = codspeed::fifo::BenchGuard::new_with_runner_fifo();
+            self.bench(measurement, &m_iters, parameter)
+        };
         let m_iters_f: Vec<f64> = m_iters.iter().map(|&x| x as f64).collect();
 
         (