diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 868e7ce0e99..f0a27b6e40d 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -32,6 +32,108 @@ "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" }, + { + "syscall": "readv", + "comment": "Used by the VirtIO net device to read from tap" + }, + { + "syscall": "memfd_create", + "comment": "Used by the IovDeque implementation" + }, + { + "syscall": "fcntl", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "FCNTL_F_SETFD" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 6, + "comment": "F_SEAL_SHRINK|F_SEAL_GROW" + } + ] + }, + { + "syscall": "fcntl", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "FCNTL_F_SETFD" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 1, + "comment": "F_SEAL_SEAL" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 4096, + "comment": "Page size allocation" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 3, + "comment": "PROT_READ|PROT_WRITE" + }, + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 17, + "comment": "MAP_SHARED|MAP_FIXED" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 8192, + "comment": "2 pages allocation" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 0, + "comment": "PROT_NONE" + }, + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 34, + "comment": "MAP_PRIVATE|MAP_ANONYMOUS" + } + ] + }, { "syscall": "fsync" }, diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index e5b4b690196..630211f47d1 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -32,6 +32,108 @@ "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" }, + { + "syscall": "readv", + "comment": "Used by the VirtIO net device to read from tap" + }, + { + "syscall": "memfd_create", + "comment": "Used by the IovDeque implementation" + }, + { + "syscall": "fcntl", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "FCNTL_F_SETFD" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 6, + "comment": "F_SEAL_SHRINK|F_SEAL_GROW" + } + ] + }, + { + "syscall": "fcntl", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1033, + "comment": "FCNTL_F_SETFD" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 1, + "comment": "F_SEAL_SEAL" + } + ] + }, + { + "syscall": "mmap", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 4096, + "comment": "Page size allocation" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 3, + "comment": "PROT_READ|PROT_WRITE" + }, + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 17, + "comment": "MAP_SHARED|MAP_FIXED" + } + ] + }, + { 
+ "syscall": "mmap", + "comment": "Used by the IovDeque implementation", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 8192, + "comment": "2 pages allocation" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 0, + "comment": "PROT_NONE" + }, + { + "index": 3, + "type": "dword", + "op": "eq", + "val": 34, + "comment": "MAP_PRIVATE|MAP_ANONYMOUS" + } + ] + }, { "syscall": "fsync" }, diff --git a/src/vmm/src/devices/virtio/iov_deque.rs b/src/vmm/src/devices/virtio/iov_deque.rs new file mode 100644 index 00000000000..0d801d10f3d --- /dev/null +++ b/src/vmm/src/devices/virtio/iov_deque.rs @@ -0,0 +1,449 @@ +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::os::fd::AsRawFd; + +use libc::{c_int, c_void, iovec, off_t, size_t}; +use memfd; + +use super::queue::FIRECRACKER_MAX_QUEUE_SIZE; +use crate::arch::PAGE_SIZE; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum IovDequeError { + /// Error with memfd: {0} + Memfd(#[from] memfd::Error), + /// Error while resizing memfd: {0} + MemfdResize(std::io::Error), + /// Error calling mmap: {0} + Mmap(std::io::Error), +} + +/// ['IovDeque'] is a ring buffer tailored for `struct iovec` objects. +/// +/// From the point of view of API, [`IovDeque`] is a typical ring buffer that allows us to push +/// `struct iovec` objects at the end of the buffer and pop them from its beginning. +/// +/// It is tailored to store `struct iovec` objects that described memory that was passed to us from +/// the guest via a VirtIO queue. This allows us to assume the maximum size of a ring buffer (the +/// negotiated size of the queue). +// An important feature of the data structure is that it can give us a slice of all `struct iovec` +// objects in the queue, so that we can use this `&mut [iovec]` to perform operations such as +// `readv`. A typical implementation of a ring buffer allows for entries to wrap around the end of +// the underlying buffer. For example, a ring buffer with a capacity of 10 elements which +// currently holds 4 elements can look like this: +// +// tail head +// | | +// v v +// +---+---+---+---+---+---+---+---+---+---+ +// ring buffer: | C | D | | | | | | | A | B | +// +---+---+---+---+---+---+---+---+---+---+ +// +// When getting a slice for this data we should get something like that: &[A, B, C, D], which +// would require copies in order to make the elements continuous in memory. +// +// In order to avoid that and make the operation of getting a slice more efficient, we implement +// the optimization described in the "Optimization" section of the "Circular buffer" wikipedia +// entry: https://en.wikipedia.org/wiki/Circular_buffer. The optimization consists of allocating +// double the size of the virtual memory required for the buffer and map both parts on the same +// physical address. 
Looking at the same example as before, we should get, this picture: +// +// head | tail +// | | | +// v | v +// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ +// | C | D | | | | | | | A | B | C | D | | | | | | | A | B | +// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ +// First virtual page | Second virtual page +// | +// | +// +// Virtual memory +// --------------------------------------------------------------------------------------- +// Physical memory +// +// +---+---+---+---+---+---+---+---+---+---+ +// | C | D | | | | | | | A | B | +// +---+---+---+---+---+---+---+---+---+---+ +// +// Like that, the elements stored in the buffer are always laid out in contiguous virtual memory, +// so making a slice out of them does not require any copies. +#[derive(Debug)] +pub struct IovDeque { + pub iov: *mut libc::iovec, + pub start: u16, + pub len: u16, +} + +// SAFETY: This is `Send`. We hold sole ownership of the underlying buffer. +unsafe impl Send for IovDeque {} + +impl IovDeque { + /// Create a [`memfd`] object that represents a single physical page + fn create_memfd() -> Result { + // Create a sealable memfd. + let opts = memfd::MemfdOptions::default().allow_sealing(true); + let mfd = opts.create("sized-1K")?; + + // Resize to system page size. + mfd.as_file() + .set_len(PAGE_SIZE.try_into().unwrap()) + .map_err(IovDequeError::MemfdResize)?; + + // Add seals to prevent further resizing. + mfd.add_seals(&[memfd::FileSeal::SealShrink, memfd::FileSeal::SealGrow])?; + + // Prevent further sealing changes. + mfd.add_seal(memfd::FileSeal::SealSeal)?; + + Ok(mfd) + } + + /// A safe wrapper on top of libc's `mmap` system call + /// + /// # Safety: Callers need to make sure that the arguments to `mmap` are valid + unsafe fn mmap( + addr: *mut c_void, + len: size_t, + prot: c_int, + flags: c_int, + fd: c_int, + offset: off_t, + ) -> Result<*mut c_void, IovDequeError> { + let ptr = libc::mmap(addr, len, prot, flags, fd, offset); + if ptr == libc::MAP_FAILED { + return Err(IovDequeError::Mmap(std::io::Error::last_os_error())); + } + + Ok(ptr) + } + + /// Allocate memory for our ring buffer + /// + /// This will allocate exactly two pages of virtual memory. In order to implement the + /// optimization that allows us to always have elements in contiguous memory we need + /// allocations at the granularity of `PAGE_SIZE`. Now, our queues are at maximum 256 + /// descriptors long and `struct iovec` looks like this: + /// + /// ```Rust + /// pub struct iovec { + /// pub iov_base: *mut ::c_void, + /// pub iov_len: ::size_t, + /// } + /// ``` + /// + /// so, it's 16 bytes long. As a result, we need a single page for holding the actual data of + /// our buffer. + fn allocate_ring_buffer_memory() -> Result<*mut c_void, IovDequeError> { + // The fact that we allocate two pages is due to the size of `struct iovec` times our queue + // size equals the page size. Add here a debug assertion to reflect that and ensure that we + // will adapt our logic if the assumption changes in the future. + const { + assert!( + std::mem::size_of::() * FIRECRACKER_MAX_QUEUE_SIZE as usize == PAGE_SIZE + ); + } + + // SAFETY: We are calling the system call with valid arguments + unsafe { + Self::mmap( + std::ptr::null_mut(), + PAGE_SIZE * 2, + libc::PROT_NONE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, + -1, + 0, + ) + } + } + + /// Create a new [`IovDeque`] that can hold memory described by a single VirtIO queue. 
+ pub fn new() -> Result { + let memfd = Self::create_memfd()?; + let raw_memfd = memfd.as_file().as_raw_fd(); + let buffer = Self::allocate_ring_buffer_memory()?; + + // Map the first page of virtual memory to the physical page described by the memfd object + // SAFETY: We are calling the system call with valid arguments + let _ = unsafe { + Self::mmap( + buffer, + PAGE_SIZE, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED | libc::MAP_FIXED, + raw_memfd, + 0, + ) + }?; + + // Map the second page of virtual memory to the physical page described by the memfd object + // + // SAFETY: This is safe because: + // * Both `buffer` and the result of `buffer.add(PAGE_SIZE)` are within bounds of the + // allocation we got from `Self::allocate_ring_buffer_memory`. + // * The computed offset is `PAGE_SIZE * size_of::() == PAGE_SIZE bytes` which fits + // in `isize` + // * The resulting pointer is the beginning of the second page of our allocation, so it + // doesn't wrap around the address space. + let next_page = unsafe { buffer.add(PAGE_SIZE) }; + + // SAFETY: We are calling the system call with valid arguments + let _ = unsafe { + Self::mmap( + next_page, + PAGE_SIZE, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED | libc::MAP_FIXED, + raw_memfd, + 0, + ) + }?; + + Ok(Self { + iov: buffer.cast(), + start: 0, + len: 0, + }) + } + + /// Returns the number of `iovec` objects currently in the [`IovDeque`] + #[inline(always)] + pub fn len(&self) -> u16 { + self.len + } + + /// Returns `true` if the [`IovDeque`] is full, `false` otherwise + #[inline(always)] + pub fn is_full(&self) -> bool { + self.len() == FIRECRACKER_MAX_QUEUE_SIZE + } + + /// Resets the queue, dropping all its elements. + #[inline(always)] + pub fn clear(&mut self) { + self.start = 0; + self.len = 0; + } + + /// Adds an `iovec` in the ring buffer. + /// + /// Returns an `IovDequeError::Full` error if the buffer is full. + pub fn push_back(&mut self, iov: iovec) { + // This should NEVER happen, since our ring buffer is as big as the maximum queue size. + // We also check for the sanity of the VirtIO queues, in queue.rs, which means that if we + // ever try to add something in a full ring buffer, there is an internal bug in the device + // emulation logic. Panic here because the device is hopelessly broken. + assert!( + !self.is_full(), + "The number of `iovec` objects is bigger than the available space" + ); + + // SAFETY: self.iov is a valid pointer and `self.start + self.len` is within range (we + // asserted before that the buffer is not full). + unsafe { + self.iov + .add((self.start + self.len) as usize) + .write_volatile(iov) + }; + self.len += 1; + } + + /// Pops the first `nr_iovecs` iovecs from the buffer. + /// + /// Returns the total number of bytes of all the popped iovecs. This will panic if we are asked + /// to pop more iovecs than what is currently available in the buffer. + pub fn pop_front(&mut self, nr_iovecs: u16) { + assert!( + self.len() >= nr_iovecs, + "Internal bug! Trying to drop more iovec objects than what is available" + ); + + self.start += nr_iovecs; + self.len -= nr_iovecs; + if self.start >= FIRECRACKER_MAX_QUEUE_SIZE { + self.start -= FIRECRACKER_MAX_QUEUE_SIZE; + } + } + + /// Get a slice of the iovec objects currently in the buffer. + pub fn as_slice(&self) -> &[iovec] { + // SAFETY: Here we create a slice out of the existing elements in the buffer (not the whole + // allocated memory). 
That means that we can: + // * We can read `self.len * mem::size_of::()` bytes out of the memory range we are + // returning. + // * `self.iov.add(self.start.into())` is a non-null pointer and aligned. + // * The underlying memory comes from a single allocation. + // * The returning pointer points to `self.len` consecutive initialized `iovec` objects. + // * We are only accessing the underlying memory through the returned slice. Since we are + // returning a slice of only the existing pushed elements the slice does not contain any + // aliasing references. + // * The slice can be up to 1 page long which is smaller than `isize::MAX`. + unsafe { + let slice_start = self.iov.add(self.start.into()); + std::slice::from_raw_parts(slice_start, self.len.into()) + } + } + + /// Get a mutable slice of the iovec objects currently in the buffer. + pub fn as_mut_slice(&mut self) -> &mut [iovec] { + // SAFETY: Here we create a slice out of the existing elements in the buffer (not the whole + // allocated memory). That means that we can: + // * We can read/write `self.len * mem::size_of::()` bytes out of the memory range we + // are returning. + // * The underlying memory comes from a single allocation. + // * `self.iov.add(self.start.into())` is a non-null pointer and aligned + // * The returning pointer points to `self.len` consecutive initialized `iovec` objects. + // * We are only accessing the underlying memory through the returned slice. Since we are + // returning a slice of only the existing pushed elements the slice does not contain any + // aliasing references. + // * The slice can be up to 1 page long which is smaller than `isize::MAX`. + unsafe { + let slice_start = self.iov.add(self.start.into()); + std::slice::from_raw_parts_mut(slice_start, self.len.into()) + } + } +} + +impl Drop for IovDeque { + fn drop(&mut self) { + // SAFETY: We are passing an address that we got from a previous allocation of `2 * + // PAGE_SIZE` bytes by calling mmap + let _ = unsafe { libc::munmap(self.iov.cast(), PAGE_SIZE * 2) }; + } +} + +#[cfg(test)] +mod tests { + use libc::iovec; + + use super::IovDeque; + + #[test] + fn test_new() { + let deque = IovDeque::new().unwrap(); + assert_eq!(deque.len(), 0); + } + + fn make_iovec(id: u16, len: u16) -> iovec { + iovec { + iov_base: id as *mut libc::c_void, + iov_len: len as usize, + } + } + + #[test] + #[should_panic] + fn test_push_back_too_many() { + let mut deque = IovDeque::new().unwrap(); + assert_eq!(deque.len(), 0); + + for i in 0u16..256 { + deque.push_back(make_iovec(i, i)); + assert_eq!(deque.len(), i + 1); + } + + deque.push_back(make_iovec(0, 0)); + } + + #[test] + #[should_panic] + fn test_pop_front_from_empty() { + let mut deque = IovDeque::new().unwrap(); + deque.pop_front(1); + } + + #[test] + #[should_panic] + fn test_pop_front_too_many() { + let mut deque = IovDeque::new().unwrap(); + deque.push_back(make_iovec(42, 42)); + deque.pop_front(2); + } + + #[test] + fn test_pop() { + let mut deque = IovDeque::new().unwrap(); + assert_eq!(deque.len(), 0); + assert!(!deque.is_full()); + deque.pop_front(0); + + for i in 0u16..256 { + deque.push_back(make_iovec(i, i)); + assert_eq!(deque.len(), i + 1); + } + + assert!(deque.is_full()); + assert!(deque.len() != 0); + + for i in 0u16..256 { + deque.pop_front(1); + assert_eq!(deque.len(), 256 - i - 1); + } + } + + #[test] + fn test_pop_many() { + let mut deque = IovDeque::new().unwrap(); + + for i in 0u16..256 { + deque.push_back(make_iovec(i, i)); + } + + deque.pop_front(1); + assert_eq!(deque.len(), 255); + 
deque.pop_front(2); + assert_eq!(deque.len(), 253); + deque.pop_front(4); + assert_eq!(deque.len(), 249); + deque.pop_front(8); + assert_eq!(deque.len(), 241); + deque.pop_front(16); + assert_eq!(deque.len(), 225); + deque.pop_front(32); + assert_eq!(deque.len(), 193); + deque.pop_front(64); + assert_eq!(deque.len(), 129); + deque.pop_front(128); + assert_eq!(deque.len(), 1); + } + + #[test] + fn test_as_slice() { + let mut deque = IovDeque::new().unwrap(); + assert!(deque.as_slice().is_empty()); + + for i in 0..256 { + deque.push_back(make_iovec(i, 100)); + assert_eq!(deque.as_slice().len(), (i + 1) as usize); + } + let copy: Vec = deque.as_slice().to_vec(); + + assert_eq!(copy.len(), deque.len() as usize); + for (i, iov) in deque.as_slice().iter().enumerate() { + assert_eq!(iov.iov_len, copy[i].iov_len); + } + } + + #[test] + fn test_as_mut_slice() { + let mut deque = IovDeque::new().unwrap(); + assert!(deque.as_mut_slice().is_empty()); + + for i in 0..256 { + deque.push_back(make_iovec(i, 100)); + assert_eq!(deque.as_mut_slice().len(), (i + 1) as usize); + } + + let copy: Vec = deque.as_mut_slice().to_vec(); + deque + .as_mut_slice() + .iter_mut() + .for_each(|iov| iov.iov_len *= 2); + + assert_eq!(copy.len(), deque.len() as usize); + for (i, iov) in deque.as_slice().iter().enumerate() { + assert_eq!(iov.iov_len, 2 * copy[i].iov_len); + } + } +} diff --git a/src/vmm/src/devices/virtio/iovec.rs b/src/vmm/src/devices/virtio/iovec.rs index 3acde02fc05..5a015975df0 100644 --- a/src/vmm/src/devices/virtio/iovec.rs +++ b/src/vmm/src/devices/virtio/iovec.rs @@ -4,12 +4,15 @@ use std::io::ErrorKind; use libc::{c_void, iovec, size_t}; +use serde::{Deserialize, Serialize}; +#[cfg(not(kani))] use smallvec::SmallVec; use vm_memory::bitmap::Bitmap; use vm_memory::{ GuestMemory, GuestMemoryError, ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile, }; +use super::iov_deque::{IovDeque, IovDequeError}; use crate::devices::virtio::queue::DescriptorChain; use crate::vstate::memory::GuestMemoryMmap; @@ -23,6 +26,8 @@ pub enum IoVecError { OverflowedDescriptor, /// Guest memory error: {0} GuestMemory(#[from] GuestMemoryError), + /// Error with underlying `IovDeque`: {0} + IovDeque(#[from] IovDequeError), } // Using SmallVec in the kani proofs causes kani to use unbounded amounts of memory @@ -214,42 +219,55 @@ impl IoVecBuffer { } } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ParsedDescriptorChain { + pub head_index: u16, + pub length: u32, + pub nr_iovecs: u16, +} + /// This is essentially a wrapper of a `Vec` which can be passed to `libc::readv`. /// /// It describes a write-only buffer passed to us by the guest that is scattered across multiple /// memory regions. Additionally, this wrapper provides methods that allow reading arbitrary ranges /// of data from that buffer. -#[derive(Debug, Default, Clone)] +#[derive(Debug)] pub struct IoVecBufferMut { // container of the memory regions included in this IO vector - vecs: IoVecVec, + pub vecs: IovDeque, // Total length of the IoVecBufferMut - len: u32, + pub len: usize, } impl IoVecBufferMut { - /// Create an `IoVecBuffer` from a `DescriptorChain` + /// Parse a `DescriptorChain` object and append the memory regions it describes in the + /// underlying ring buffer. 
/// /// # Safety /// /// The descriptor chain cannot be referencing the same memory location as another chain - pub unsafe fn load_descriptor_chain( + unsafe fn parse_descriptor( &mut self, mem: &GuestMemoryMmap, head: DescriptorChain, - ) -> Result<(), IoVecError> { - self.clear(); - + ) -> Result { + let head_index = head.index; let mut next_descriptor = Some(head); + let mut length = 0u32; + let mut nr_iovecs = 0u16; while let Some(desc) = next_descriptor { if !desc.is_write_only() { + self.vecs.pop_front(nr_iovecs); return Err(IoVecError::ReadOnlyDescriptor); } // We use get_slice instead of `get_host_address` here in order to have the whole // range of the descriptor chain checked, i.e. [addr, addr + len) is a valid memory // region in the GuestMemoryMmap. - let slice = mem.get_slice(desc.addr, desc.len as usize)?; + let slice = mem.get_slice(desc.addr, desc.len as usize).map_err(|err| { + self.vecs.pop_front(nr_iovecs); + err + })?; // We need to mark the area of guest memory that will be mutated through this // IoVecBufferMut as dirty ahead of time, as we loose access to all @@ -257,21 +275,79 @@ impl IoVecBufferMut { slice.bitmap().mark_dirty(0, desc.len as usize); let iov_base = slice.ptr_guard_mut().as_ptr().cast::(); - self.vecs.push(iovec { + self.vecs.push_back(iovec { iov_base, iov_len: desc.len as size_t, }); - self.len = self - .len + nr_iovecs += 1; + length = length .checked_add(desc.len) - .ok_or(IoVecError::OverflowedDescriptor)?; + .ok_or(IoVecError::OverflowedDescriptor) + .map_err(|err| { + self.vecs.pop_front(nr_iovecs); + err + })?; next_descriptor = desc.next_descriptor(); } + self.len = self.len.checked_add(length as usize).ok_or_else(|| { + self.vecs.pop_front(nr_iovecs); + IoVecError::OverflowedDescriptor + })?; + + Ok(ParsedDescriptorChain { + head_index, + length, + nr_iovecs, + }) + } + + /// Create an empty `IoVecBufferMut`. + pub fn new() -> Result { + let vecs = IovDeque::new()?; + Ok(Self { vecs, len: 0 }) + } + + /// Create an `IoVecBufferMut` from a `DescriptorChain` + /// + /// This will clear any previous `iovec` objects in the buffer and load the new + /// [`DescriptorChain`]. + /// + /// # Safety + /// + /// The descriptor chain cannot be referencing the same memory location as another chain + pub unsafe fn load_descriptor_chain( + &mut self, + mem: &GuestMemoryMmap, + head: DescriptorChain, + ) -> Result<(), IoVecError> { + self.clear(); + let _ = self.parse_descriptor(mem, head)?; Ok(()) } + /// Append a `DescriptorChain` in this `IoVecBufferMut` + /// + /// # Safety + /// + /// The descriptor chain cannot be referencing the same memory location as another chain + pub unsafe fn append_descriptor_chain( + &mut self, + mem: &GuestMemoryMmap, + head: DescriptorChain, + ) -> Result { + self.parse_descriptor(mem, head) + } + + /// Drop memory from the `IoVecBufferMut` + /// + /// This will drop memory described by the `IoVecBufferMut` from the beginning. 
+ pub fn drop_descriptor_chain(&mut self, parse_descriptor: &ParsedDescriptorChain) { + self.vecs.pop_front(parse_descriptor.nr_iovecs); + self.len -= parse_descriptor.length as usize; + } + /// Create an `IoVecBuffer` from a `DescriptorChain` /// /// # Safety @@ -281,20 +357,29 @@ impl IoVecBufferMut { mem: &GuestMemoryMmap, head: DescriptorChain, ) -> Result { - let mut new_buffer = Self::default(); + let mut new_buffer = Self::new()?; new_buffer.load_descriptor_chain(mem, head)?; Ok(new_buffer) } /// Get the total length of the memory regions covered by this `IoVecBuffer` - pub(crate) fn len(&self) -> u32 { + /// + /// In contrast to the equivalent [`IoVecBuffer::len()`] which returns `u32`, this one returns + /// `usize` since the buffer can contain multiple `DescriptorChain` objects, so we don't have + /// the limit that the length of a buffer is limited by `u32`. + pub(crate) fn len(&self) -> usize { self.len } + /// Returns a pointer to the memory keeping the `iovec` structs + pub fn as_iovec_mut_slice(&mut self) -> &mut [iovec] { + self.vecs.as_mut_slice() + } + /// Clears the `iovec` array pub fn clear(&mut self) { self.vecs.clear(); - self.len = 0u32; + self.len = 0; } /// Writes a number of bytes into the `IoVecBufferMut` starting at a given offset. @@ -313,7 +398,7 @@ impl IoVecBufferMut { mut buf: &[u8], offset: usize, ) -> Result<(), VolatileMemoryError> { - if offset < self.len() as usize { + if offset < self.len() { let expected = buf.len(); let bytes_written = self.write_volatile_at(&mut buf, offset, expected)?; @@ -342,7 +427,7 @@ impl IoVecBufferMut { ) -> Result { let mut total_bytes_read = 0; - for iov in &self.vecs { + for iov in self.vecs.as_slice() { if len == 0 { break; } @@ -391,6 +476,7 @@ mod tests { use vm_memory::VolatileMemoryError; use super::{IoVecBuffer, IoVecBufferMut}; + use crate::devices::virtio::iov_deque::IovDeque; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::VirtQueue; use crate::test_utils::multi_region_mem; @@ -429,17 +515,36 @@ mod tests { impl From<&mut [u8]> for IoVecBufferMut { fn from(buf: &mut [u8]) -> Self { + let mut vecs = IovDeque::new().unwrap(); + vecs.push_back(iovec { + iov_base: buf.as_mut_ptr().cast::(), + iov_len: buf.len(), + }); + Self { - vecs: vec![iovec { - iov_base: buf.as_mut_ptr().cast::(), - iov_len: buf.len(), - }] - .into(), - len: buf.len().try_into().unwrap(), + vecs, + len: buf.len(), } } } + impl From> for IoVecBufferMut { + fn from(buffer: Vec<&mut [u8]>) -> Self { + let mut len = 0usize; + let mut vecs = IovDeque::new().unwrap(); + for slice in buffer { + len += slice.len(); + + vecs.push_back(iovec { + iov_base: slice.as_ptr() as *mut c_void, + iov_len: slice.len(), + }); + } + + Self { vecs, len } + } + } + fn default_mem() -> GuestMemoryMmap { multi_region_mem(&[ (GuestAddress(0), 0x10000), @@ -528,8 +633,19 @@ mod tests { let head = q.pop().unwrap(); // SAFETY: This descriptor chain is only loaded once in this test - let iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() }; + let mut iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() }; assert_eq!(iovec.len(), 4 * 64); + + // We are creating a new queue where we can get descriptors from. Probably, this is not + // something that we will ever want to do, as `IoVecBufferMut`s are typically + // (concpetually) associated with a single `Queue`. We just do this here to be able to test + // the appending logic. 
+ let (mut q, _) = write_only_chain(&mem); + let head = q.pop().unwrap(); + // SAFETY: it is actually unsafe, but we just want to check the length of the + // `IoVecBufferMut` after appending. + let _ = unsafe { iovec.append_descriptor_chain(&mem, head).unwrap() }; + assert_eq!(iovec.len(), 8 * 64); } #[test] @@ -679,6 +795,7 @@ mod tests { } #[cfg(kani)] +#[allow(dead_code)] // Avoid warning when using stubs mod verification { use std::mem::ManuallyDrop; @@ -687,6 +804,9 @@ mod verification { use vm_memory::VolatileSlice; use super::{IoVecBuffer, IoVecBufferMut, IoVecVec}; + use crate::arch::PAGE_SIZE; + use crate::devices::virtio::iov_deque::IovDeque; + use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; // Maximum memory size to use for our buffers. For the time being 1KB. const GUEST_MEMORY_SIZE: usize = 1 << 10; @@ -698,14 +818,58 @@ mod verification { // >= 1. const MAX_DESC_LENGTH: usize = 4; + mod stubs { + use super::*; + + /// This is a stub for the `IovDeque::push_back` method. + /// + /// `IovDeque` relies on a special allocation of two pages of virtual memory, where both of + /// these point to the same underlying physical page. This way, the contents of the first + /// page of virtual memory are automatically mirrored in the second virtual page. We do + /// that in order to always have the elements that are currently in the ring buffer in + /// consecutive (virtual) memory. + /// + /// To build this particular memory layout we create a new `memfd` object, allocate memory + /// with `mmap` and call `mmap` again to make sure both pages point to the page allocated + /// via the `memfd` object. These ffi calls make kani complain, so here we mock the + /// `IovDeque` object memory with a normal memory allocation of two pages worth of data. + /// + /// This stub helps imitate the effect of mirroring without all the elaborate memory + /// allocation trick. + pub fn push_back(deque: &mut IovDeque, iov: iovec) { + // This should NEVER happen, since our ring buffer is as big as the maximum queue size. + // We also check for the sanity of the VirtIO queues, in queue.rs, which means that if + // we ever try to add something in a full ring buffer, there is an internal + // bug in the device emulation logic. Panic here because the device is + // hopelessly broken. + assert!( + !deque.is_full(), + "The number of `iovec` objects is bigger than the available space" + ); + + let offset = (deque.start + deque.len) as usize; + let mirror = if offset >= FIRECRACKER_MAX_QUEUE_SIZE as usize { + offset - FIRECRACKER_MAX_QUEUE_SIZE as usize + } else { + offset + FIRECRACKER_MAX_QUEUE_SIZE as usize + }; + + // SAFETY: self.iov is a valid pointer and `self.start + self.len` is within range (we + // asserted before that the buffer is not full). + unsafe { deque.iov.add(offset).write_volatile(iov) }; + unsafe { deque.iov.add(mirror).write_volatile(iov) }; + deque.len += 1; + } + } + fn create_iovecs(mem: *mut u8, size: usize, nr_descs: usize) -> (IoVecVec, u32) { let mut vecs: Vec = Vec::with_capacity(nr_descs); let mut len = 0u32; for _ in 0..nr_descs { - // The `IoVecBuffer(Mut)` constructors ensure that the memory region described by every + // The `IoVecBuffer` constructors ensure that the memory region described by every // `Descriptor` in the chain is a valid, i.e. it is memory with then guest's memory // mmap. 
The assumption, here, that the last address is within the memory object's - // bound substitutes these checks that `IoVecBuffer(Mut)::new() performs.` + // bound substitutes these checks that `IoVecBuffer::new() performs.` let addr: usize = kani::any(); let iov_len: usize = kani::any_where(|&len| matches!(addr.checked_add(len), Some(x) if x <= size)); @@ -728,6 +892,41 @@ mod verification { } } + fn create_iov_deque() -> IovDeque { + // SAFETY: safe because the layout has non-zero size + let mem = unsafe { + std::alloc::alloc(std::alloc::Layout::from_size_align_unchecked( + 2 * PAGE_SIZE, + PAGE_SIZE, + )) + }; + IovDeque { + iov: mem.cast(), + start: kani::any_where(|&start| start < FIRECRACKER_MAX_QUEUE_SIZE), + len: 0, + } + } + + fn create_iovecs_mut(mem: *mut u8, size: usize, nr_descs: usize) -> (IovDeque, u32) { + let mut vecs = create_iov_deque(); + let mut len = 0u32; + for _ in 0..nr_descs { + // The `IoVecBufferMut` constructors ensure that the memory region described by every + // `Descriptor` in the chain is a valid, i.e. it is memory with then guest's memory + // mmap. The assumption, here, that the last address is within the memory object's + // bound substitutes these checks that `IoVecBufferMut::new() performs.` + let addr: usize = kani::any(); + let iov_len: usize = + kani::any_where(|&len| matches!(addr.checked_add(len), Some(x) if x <= size)); + let iov_base = unsafe { mem.offset(addr.try_into().unwrap()) } as *mut c_void; + + vecs.push_back(iovec { iov_base, iov_len }); + len += u32::try_from(iov_len).unwrap(); + } + + (vecs, len) + } + impl IoVecBufferMut { fn any_of_length(nr_descs: usize) -> Self { // We only write into `IoVecBufferMut` objects, so we can simply create a guest memory @@ -739,8 +938,11 @@ mod verification { )) }; - let (vecs, len) = create_iovecs(mem, GUEST_MEMORY_SIZE, nr_descs); - Self { vecs, len } + let (vecs, len) = create_iovecs_mut(mem, GUEST_MEMORY_SIZE, nr_descs); + Self { + vecs, + len: len.try_into().unwrap(), + } } } @@ -815,12 +1017,13 @@ mod verification { #[kani::proof] #[kani::unwind(5)] #[kani::solver(cadical)] + #[kani::stub(IovDeque::push_back, stubs::push_back)] fn verify_write_to_iovec() { for nr_descs in 0..MAX_DESC_LENGTH { let mut iov_mut = IoVecBufferMut::any_of_length(nr_descs); let mut buf = kani::vec::any_vec::(); - let offset: u32 = kani::any(); + let offset: usize = kani::any(); // We can't really check the contents that the operation here writes into // `IoVecBufferMut`, because our `IoVecBufferMut` being completely arbitrary @@ -835,14 +1038,11 @@ mod verification { // Ok(...) 
assert_eq!( iov_mut - .write_volatile_at( - &mut KaniBuffer(&mut buf), - offset as usize, - GUEST_MEMORY_SIZE - ) + .write_volatile_at(&mut KaniBuffer(&mut buf), offset, GUEST_MEMORY_SIZE) .unwrap(), buf.len().min(iov_mut.len().saturating_sub(offset) as usize) ); + std::mem::forget(iov_mut.vecs); } } } diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs index f68c2a123c9..9931e1211d1 100644 --- a/src/vmm/src/devices/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/mod.rs @@ -16,6 +16,7 @@ pub mod balloon; pub mod block; pub mod device; pub mod gen; +mod iov_deque; pub mod iovec; pub mod mmio; pub mod net; diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index f8c29f95175..ed6f36dcb0b 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -5,14 +5,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. -use std::io::Read; +use std::collections::VecDeque; use std::mem; use std::net::Ipv4Addr; use std::sync::{Arc, Mutex}; use libc::EAGAIN; -use log::{error, warn}; -use vm_memory::GuestMemoryError; +use log::error; use vmm_sys_util::eventfd::EventFd; use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; @@ -23,13 +22,15 @@ use crate::devices::virtio::gen::virtio_net::{ VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC, }; use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use crate::devices::virtio::iovec::IoVecBuffer; +use crate::devices::virtio::iovec::{ + IoVecBuffer, IoVecBufferMut, IoVecError, ParsedDescriptorChain, +}; use crate::devices::virtio::net::metrics::{NetDeviceMetrics, NetMetricsPerDevice}; use crate::devices::virtio::net::tap::Tap; use crate::devices::virtio::net::{ gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX, }; -use crate::devices::virtio::queue::{DescriptorChain, Queue}; +use crate::devices::virtio::queue::{DescriptorChain, Queue, FIRECRACKER_MAX_QUEUE_SIZE}; use crate::devices::virtio::{ActivateError, TYPE_NET}; use crate::devices::{report_net_event_fail, DeviceError}; use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN; @@ -40,24 +41,10 @@ use crate::mmds::ns::MmdsNetworkStack; use crate::rate_limiter::{BucketUpdate, RateLimiter, TokenType}; use crate::utils::net::mac::MacAddr; use crate::utils::u64_to_usize; -use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap}; +use crate::vstate::memory::{ByteValued, GuestMemoryMmap}; const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN; -#[derive(Debug, thiserror::Error, displaydoc::Display)] -enum FrontendError { - /// Add user. - AddUsed, - /// Descriptor chain too mall. - DescriptorChainTooSmall, - /// Empty queue. - EmptyQueue, - /// Guest memory error: {0} - GuestMemory(GuestMemoryError), - /// Read only descriptor. - ReadOnlyDescriptor, -} - pub(crate) const fn vnet_hdr_len() -> usize { mem::size_of::() } @@ -102,6 +89,118 @@ pub struct ConfigSpace { // SAFETY: `ConfigSpace` contains only PODs in `repr(C)` or `repr(transparent)`, without padding. 
unsafe impl ByteValued for ConfigSpace {} +#[derive(Debug, thiserror::Error, displaydoc::Display)] +enum AddRxBufferError { + /// Error while parsing new buffer: {0} + Parsing(#[from] IoVecError), + /// RX buffer is too small + BufferTooSmall, +} + +/// A map of all the memory the guest has provided us with for performing RX +#[derive(Debug)] +pub struct RxBuffers { + // minimum size of a usable buffer for doing RX + pub min_buffer_size: u32, + // An [`IoVecBufferMut`] covering all the memory we have available for receiving network + // frames. + pub iovec: IoVecBufferMut, + // A map of which part of the memory belongs to which `DescriptorChain` object + pub parsed_descriptors: VecDeque, + // Buffers that we have used and they are ready to be given back to the guest. + pub deferred_descriptor: Option, +} + +impl RxBuffers { + /// Create a new [`RxBuffers`] object for storing guest memory for performing RX + fn new() -> Result { + Ok(Self { + min_buffer_size: 0, + iovec: IoVecBufferMut::new()?, + parsed_descriptors: VecDeque::with_capacity(FIRECRACKER_MAX_QUEUE_SIZE.into()), + deferred_descriptor: None, + }) + } + + /// Add a new `DescriptorChain` that we received from the RX queue in the buffer. + /// + /// SAFETY: The `DescriptorChain` cannot be referencing the same memory location as any other + /// `DescriptorChain`. (See also related comment in + /// [`IoVecBufferMut::append_descriptor_chain`]). + unsafe fn add_buffer( + &mut self, + mem: &GuestMemoryMmap, + head: DescriptorChain, + ) -> Result<(), AddRxBufferError> { + let parsed_dc = self.iovec.append_descriptor_chain(mem, head)?; + if parsed_dc.length < self.min_buffer_size { + self.iovec.drop_descriptor_chain(&parsed_dc); + return Err(AddRxBufferError::BufferTooSmall); + } + self.parsed_descriptors.push_back(parsed_dc); + Ok(()) + } + + /// Returns the number of available `iovec` objects. + #[inline(always)] + fn len(&self) -> usize { + self.iovec.len() + } + + /// Returns `true` if there aren't any available `iovec` objects. + #[inline(always)] + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Mark the first `size` bytes of available memory as used. + /// + /// # Safety: + /// + /// * The `RxBuffers` should include at least one parsed `DescriptorChain`. + /// * `size` needs to be smaller or equal to total length of the first `DescriptorChain` stored + /// in the `RxBuffers`. + unsafe fn mark_used(&mut self, size: u32) -> ParsedDescriptorChain { + // Since we were able to write a frame in guest memory, we should have at least one + // descriptor chain here. If not, we have a bug, so fail fast, since the device is + // fundamentally broken. + let mut parsed_dc = self.parsed_descriptors.pop_front().expect( + "net: internal bug. Mismatch between written frame size and available descriptors", + ); + + self.header_set_num_buffers(1); + self.iovec.drop_descriptor_chain(&parsed_dc); + parsed_dc.length = size; + parsed_dc + } + + /// Write the number of descriptors used in VirtIO header + fn header_set_num_buffers(&mut self, nr_descs: u16) { + // We can unwrap here, because we have checked before that the `IoVecBufferMut` holds at + // least one buffer with the proper size, depending on the feature negotiation. In any + // case, the buffer holds memory of at least `std::mem::size_of::()` + // bytes. 
+ self.iovec + .write_all_volatile_at( + &nr_descs.to_le_bytes(), + std::mem::offset_of!(virtio_net_hdr_v1, num_buffers), + ) + .unwrap() + } + + /// This will let the guest know that about all the `DescriptorChain` object that has been + /// used to receive a frame from the TAP. + fn finish_frame(&mut self, dc: &ParsedDescriptorChain, rx_queue: &mut Queue) { + // It is fine to `.unrap()` here. The only reason why `add_used` can fail is if the + // `head_index` is not a valid descriptor id. `head_index` here is a valid + // `DescriptorChain` index. We got it from `queue.pop_or_enable_notification()` which + // checks for its validity. In other words, if this unwrap() fails there's a bug in our + // emulation logic which, most likely, we can't recover from. So, let's crash here + // instead of logging an error and continuing. + rx_queue.add_used(dc.head_index, dc.length).unwrap(); + } +} + /// VirtIO network device. /// /// It emulates a network device able to exchange L2 frames between the guest @@ -122,9 +221,6 @@ pub struct Net { pub(crate) rx_rate_limiter: RateLimiter, pub(crate) tx_rate_limiter: RateLimiter, - pub(crate) rx_deferred_frame: bool, - - rx_bytes_read: usize, rx_frame_buf: [u8; MAX_BUFFER_SIZE], tx_frame_headers: [u8; frame_hdr_len()], @@ -143,6 +239,7 @@ pub struct Net { pub(crate) metrics: Arc, tx_buffer: IoVecBuffer, + pub(crate) rx_buffer: RxBuffers, } impl Net { @@ -189,8 +286,6 @@ impl Net { queue_evts, rx_rate_limiter, tx_rate_limiter, - rx_deferred_frame: false, - rx_bytes_read: 0, rx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, @@ -201,6 +296,7 @@ impl Net { mmds_ns: None, metrics: NetMetricsPerDevice::alloc(id), tx_buffer: Default::default(), + rx_buffer: RxBuffers::new()?, }) } @@ -311,126 +407,50 @@ impl Net { // Attempts to copy a single frame into the guest if there is enough // rate limiting budget. // Returns true on successful frame delivery. - fn rate_limited_rx_single_frame(&mut self) -> bool { - if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) { + fn rate_limited_rx_single_frame(&mut self, dc: &ParsedDescriptorChain) -> bool { + let rx_queue = &mut self.queues[RX_INDEX]; + if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, dc.length as u64) { self.metrics.rx_rate_limiter_throttled.inc(); return false; } - // Attempt frame delivery. - let success = self.write_frame_to_guest(); - - // Undo the tokens consumption if guest delivery failed. 
- if !success { - // revert the rate limiting budget consumption - Self::rate_limiter_replenish_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64); - } - - success + self.rx_buffer.finish_frame(dc, rx_queue); + true } - /// Write a slice in a descriptor chain - /// - /// # Errors - /// - /// Returns an error if the descriptor chain is too short or - /// an inappropriate (read only) descriptor is found in the chain - fn write_to_descriptor_chain( - mem: &GuestMemoryMmap, - data: &[u8], - head: DescriptorChain, - net_metrics: &NetDeviceMetrics, - ) -> Result<(), FrontendError> { - let mut chunk = data; - let mut next_descriptor = Some(head); - - while let Some(descriptor) = &next_descriptor { - if !descriptor.is_write_only() { - return Err(FrontendError::ReadOnlyDescriptor); - } - - let len = std::cmp::min(chunk.len(), descriptor.len as usize); - match mem.write_slice(&chunk[..len], descriptor.addr) { - Ok(()) => { - net_metrics.rx_count.inc(); - chunk = &chunk[len..]; - } - Err(err) => { - error!("Failed to write slice: {:?}", err); - if let GuestMemoryError::PartialBuffer { .. } = err { - net_metrics.rx_partial_writes.inc(); - } - return Err(FrontendError::GuestMemory(err)); - } - } - - // If chunk is empty we are done here. - if chunk.is_empty() { - let len = data.len() as u64; - net_metrics.rx_bytes_count.add(len); - net_metrics.rx_packets_count.inc(); - return Ok(()); - } - - next_descriptor = descriptor.next_descriptor(); + /// Returns the minimum size of buffer we expect the guest to provide us depending on the + /// features we have negotiated with it + fn minimum_rx_buffer_size(&self) -> u32 { + if self.has_feature(VIRTIO_NET_F_GUEST_TSO4 as u64) + || self.has_feature(VIRTIO_NET_F_GUEST_TSO6 as u64) + || self.has_feature(VIRTIO_NET_F_GUEST_UFO as u64) + { + 65562 + } else { + 1526 } - - warn!("Receiving buffer is too small to hold frame of current size"); - Err(FrontendError::DescriptorChainTooSmall) } - // Copies a single frame from `self.rx_frame_buf` into the guest. - fn do_write_frame_to_guest(&mut self) -> Result<(), FrontendError> { + /// Parse available RX `DescriptorChains` from the queue + pub fn parse_rx_descriptors(&mut self) { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); - let queue = &mut self.queues[RX_INDEX]; - let head_descriptor = queue.pop_or_enable_notification().ok_or_else(|| { - self.metrics.no_rx_avail_buffer.inc(); - FrontendError::EmptyQueue - })?; - let head_index = head_descriptor.index; - - let result = Self::write_to_descriptor_chain( - mem, - &self.rx_frame_buf[..self.rx_bytes_read], - head_descriptor, - &self.metrics, - ); - // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain. - let used_len = if result.is_err() { - self.metrics.rx_fails.inc(); - 0 - } else { - // Safe to unwrap because a frame must be smaller than 2^16 bytes. - u32::try_from(self.rx_bytes_read).unwrap() - }; - queue.add_used(head_index, used_len).map_err(|err| { - error!("Failed to add available descriptor {}: {}", head_index, err); - FrontendError::AddUsed - })?; - - result - } - - // Copies a single frame from `self.rx_frame_buf` into the guest. In case of an error retries - // the operation if possible. Returns true if the operation was successfull. 
- fn write_frame_to_guest(&mut self) -> bool { - let max_iterations = self.queues[RX_INDEX].actual_size(); - for _ in 0..max_iterations { - match self.do_write_frame_to_guest() { - Ok(()) => return true, - Err(FrontendError::EmptyQueue) | Err(FrontendError::AddUsed) => { - return false; - } - Err(_) => { - // retry - continue; + while let Some(head) = queue.pop_or_enable_notification() { + let index = head.index; + // SAFETY: we are only using this `DescriptorChain` here. + if let Err(err) = unsafe { self.rx_buffer.add_buffer(mem, head) } { + self.metrics.rx_fails.inc(); + error!("net: Could not parse an RX descriptor: {err}"); + // Try to add the bad descriptor to the used ring. + if let Err(err) = queue.add_used(index, 0) { + error!( + "net: Failed to add available RX descriptor {index} while handling a \ + parsing error: {err}", + ); } } } - - false } // Tries to detour the frame to MMDS and if MMDS doesn't accept it, sends it on the host TAP. @@ -508,7 +528,20 @@ impl Net { } // We currently prioritize packets from the MMDS over regular network packets. - fn read_from_mmds_or_tap(&mut self) -> Result { + fn read_from_mmds_or_tap(&mut self) -> Result, NetError> { + // If we don't have any buffers available try to parse more from the RX queue. There might + // be some buffers we didn't get the chance to process, because we got to handle the TAP + // event before the RX queue event. + if self.rx_buffer.is_empty() { + self.parse_rx_descriptors(); + + // If after parsing the RX queue we still don't have any buffers stop processing RX + // frames. + if self.rx_buffer.is_empty() { + return Ok(None); + } + } + if let Some(ns) = self.mmds_ns.as_mut() { if let Some(len) = ns.write_next_frame(frame_bytes_from_buf_mut(&mut self.rx_frame_buf)?) @@ -517,22 +550,48 @@ impl Net { METRICS.mmds.tx_frames.inc(); METRICS.mmds.tx_bytes.add(len as u64); init_vnet_hdr(&mut self.rx_frame_buf); - return Ok(vnet_hdr_len() + len); + self.rx_buffer + .iovec + .write_all_volatile_at(&self.rx_frame_buf[..vnet_hdr_len() + len], 0)?; + // SAFETY: This is safe: + // * We checked that `rx_buffer` includes at least one `DescriptorChain` + // * `rx_frame_buf` has size of `MAX_BUFFER_SIZE` and all `DescriptorChain` objects + // are at least that big. + let dc = unsafe { + self.rx_buffer + .mark_used((vnet_hdr_len() + len).try_into().unwrap()) + }; + + return Ok(Some(dc)); } } - self.read_tap().map_err(NetError::IO) + // SAFETY: this is safe because we ensured that `self.rx_buffer` has at least one + // DescriptorChain parsed in it. + let len = unsafe { self.read_tap().map_err(NetError::IO) }?; + + // SAFETY: This is safe, + // * `rx_buffer` has at least one `DescriptorChain` + // * `read_tap` passes the first `DescriptorChain` to `readv` so we can't have read more + // bytes than its capacity. + let dc = unsafe { self.rx_buffer.mark_used(len.try_into().unwrap()) }; + Ok(Some(dc)) } + /// Read as many frames as possible. fn process_rx(&mut self) -> Result<(), DeviceError> { - // Read as many frames as possible. 
loop { match self.read_from_mmds_or_tap() { - Ok(count) => { - self.rx_bytes_read = count; + Ok(None) => { + self.metrics.no_rx_avail_buffer.inc(); + break; + } + Ok(Some(dc)) => { self.metrics.rx_count.inc(); - if !self.rate_limited_rx_single_frame() { - self.rx_deferred_frame = true; + self.metrics.rx_bytes_count.add(dc.length as u64); + self.metrics.rx_packets_count.inc(); + if !self.rate_limited_rx_single_frame(&dc) { + self.rx_buffer.deferred_descriptor = Some(dc); break; } } @@ -558,24 +617,18 @@ impl Net { self.try_signal_queue(NetQueue::Rx) } - // Process the deferred frame first, then continue reading from tap. - fn handle_deferred_frame(&mut self) -> Result<(), DeviceError> { - if self.rate_limited_rx_single_frame() { - self.rx_deferred_frame = false; - // process_rx() was interrupted possibly before consuming all - // packets in the tap; try continuing now. - return self.process_rx(); - } - - self.try_signal_queue(NetQueue::Rx) - } - fn resume_rx(&mut self) -> Result<(), DeviceError> { - if self.rx_deferred_frame { - self.handle_deferred_frame() - } else { - Ok(()) + // First try to handle any deferred frame + if let Some(deferred_descriptor) = self.rx_buffer.deferred_descriptor.take() { + // If can't finish sending this frame, re-set it as deferred and return; we can't + // process any more frames from the TAP. + if !self.rate_limited_rx_single_frame(&deferred_descriptor) { + self.rx_buffer.deferred_descriptor = Some(deferred_descriptor); + return Ok(()); + } } + + self.process_rx() } fn process_tx(&mut self) -> Result<(), DeviceError> { @@ -636,7 +689,7 @@ impl Net { &self.metrics, ) .unwrap_or(false); - if frame_consumed_by_mmds && !self.rx_deferred_frame { + if frame_consumed_by_mmds && self.rx_buffer.deferred_descriptor.is_none() { // MMDS consumed this frame/request, let's also try to process the response. process_rx_for_mmds = true; } @@ -715,8 +768,15 @@ impl Net { self.tx_rate_limiter.update_buckets(tx_bytes, tx_ops); } - fn read_tap(&mut self) -> std::io::Result { - self.tap.read(&mut self.rx_frame_buf) + /// Reads a frame from the TAP device inside the first descriptor held by `self.rx_buffer`. + /// + /// # Safety + /// + /// `self.rx_buffer` needs to have at least one descriptor chain parsed + pub unsafe fn read_tap(&mut self) -> std::io::Result { + let nr_iovecs = self.rx_buffer.parsed_descriptors[0].nr_iovecs as usize; + self.tap + .read_iovec(&mut self.rx_buffer.iovec.as_iovec_mut_slice()[..nr_iovecs]) } fn write_tap(tap: &mut Tap, buf: &IoVecBuffer) -> std::io::Result { @@ -734,7 +794,12 @@ impl Net { // rate limiters present but with _very high_ allowed rate error!("Failed to get rx queue event: {:?}", err); self.metrics.event_fails.inc(); - } else if self.rx_rate_limiter.is_blocked() { + return; + } else { + self.parse_rx_descriptors(); + } + + if self.rx_rate_limiter.is_blocked() { self.metrics.rx_rate_limiter_throttled.inc(); } else { // If the limiter is not blocked, resume the receiving of bytes. @@ -747,31 +812,14 @@ impl Net { // This is safe since we checked in the event handler that the device is activated. self.metrics.rx_tap_event_count.inc(); - // While there are no available RX queue buffers and there's a deferred_frame - // don't process any more incoming. Otherwise start processing a frame. In the - // process the deferred_frame flag will be set in order to avoid freezing the - // RX queue. 
- if self.queues[RX_INDEX].is_empty() && self.rx_deferred_frame { - self.metrics.no_rx_avail_buffer.inc(); - return; - } - // While limiter is blocked, don't process any more incoming. if self.rx_rate_limiter.is_blocked() { self.metrics.rx_rate_limiter_throttled.inc(); return; } - if self.rx_deferred_frame - // Process a deferred frame first if available. Don't read from tap again - // until we manage to receive this deferred frame. - { - self.handle_deferred_frame() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); - } else { - self.process_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); - } + self.resume_rx() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); } /// Process a single TX queue event. @@ -913,6 +961,8 @@ impl VirtioDevice for Net { .set_offload(supported_flags) .map_err(super::super::ActivateError::TapSetOffload)?; + self.rx_buffer.min_buffer_size = self.minimum_rx_buffer_size(); + if self.activate_evt.write(1).is_err() { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); @@ -958,6 +1008,14 @@ pub mod tests { use crate::utils::net::mac::{MacAddr, MAC_ADDR_LEN}; use crate::vstate::memory::{Address, GuestMemory}; + /// Write the number of descriptors used in VirtIO header + fn header_set_num_buffers(frame: &mut [u8], nr_descs: u16) { + let bytes = nr_descs.to_le_bytes(); + let offset = std::mem::offset_of!(virtio_net_hdr_v1, num_buffers); + frame[offset] = bytes[0]; + frame[offset + 1] = bytes[1]; + } + #[test] fn test_vnet_helpers() { let mut frame_buf = vec![42u8; vnet_hdr_len() - 1]; @@ -1144,9 +1202,14 @@ pub mod tests { (2, 1000, VIRTQ_DESC_F_WRITE), ], ); - let frame = th.check_rx_deferred_frame(1000); + let mut frame = inject_tap_tx_frame(&th.net(), 1000); + check_metric_after_block!( + th.net().metrics.rx_fails, + 1, + th.event_manager.run_with_timeout(100).unwrap() + ); th.rxq.check_used_elem(0, 0, 0); - + header_set_num_buffers(frame.as_mut_slice(), 1); th.check_rx_queue_resume(&frame); } @@ -1157,9 +1220,10 @@ pub mod tests { th.activate_net(); th.add_desc_chain(NetQueue::Rx, 0, &[(0, 100, VIRTQ_DESC_F_WRITE)]); - let frame = th.check_rx_deferred_frame(1000); + let mut frame = th.check_rx_discarded_buffer(1000); th.rxq.check_used_elem(0, 0, 0); + header_set_num_buffers(frame.as_mut_slice(), 1); th.check_rx_queue_resume(&frame); } @@ -1181,9 +1245,10 @@ pub mod tests { (2, 4096, VIRTQ_DESC_F_WRITE), ], ); - let frame = th.check_rx_deferred_frame(1000); + let mut frame = th.check_rx_discarded_buffer(1000); th.rxq.check_used_elem(0, 0, 0); + header_set_num_buffers(frame.as_mut_slice(), 1); th.check_rx_queue_resume(&frame); } @@ -1212,11 +1277,12 @@ pub mod tests { &[(4, 1000, VIRTQ_DESC_F_WRITE)], ); - // Add valid descriptor chain. - th.add_desc_chain(NetQueue::Rx, 1300, &[(5, 1000, VIRTQ_DESC_F_WRITE)]); + // Add valid descriptor chain. TestHelper does not negotiate any feature offloading so the + // buffers need to be at least 1526 bytes long. + th.add_desc_chain(NetQueue::Rx, 1300, &[(5, 1526, VIRTQ_DESC_F_WRITE)]); // Inject frame to tap and run epoll. - let frame = inject_tap_tx_frame(&th.net(), 1000); + let mut frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( th.net().metrics.rx_packets_count, 1, @@ -1231,10 +1297,11 @@ pub mod tests { th.rxq.check_used_elem(1, 3, 0); th.rxq.check_used_elem(2, 4, 0); // Check that the frame wasn't deferred. 
- assert!(!th.net().rx_deferred_frame); + assert!(th.net().rx_buffer.deferred_descriptor.is_none()); // Check that the frame has been written successfully to the valid Rx descriptor chain. th.rxq .check_used_elem(3, 5, frame.len().try_into().unwrap()); + header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq.dtable[5].check_data(&frame); } @@ -1257,7 +1324,7 @@ pub mod tests { ], ); // Inject frame to tap and run epoll. - let frame = inject_tap_tx_frame(&th.net(), 1000); + let mut frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( th.net().metrics.rx_packets_count, 1, @@ -1265,11 +1332,12 @@ pub mod tests { ); // Check that the frame wasn't deferred. - assert!(!th.net().rx_deferred_frame); + assert!(th.net().rx_buffer.deferred_descriptor.is_none()); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // Check that the frame has been written successfully to the Rx descriptor chain. + header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq .check_used_elem(0, 3, frame.len().try_into().unwrap()); th.rxq.dtable[3].check_data(&frame[..100]); @@ -1288,16 +1356,24 @@ pub mod tests { th.add_desc_chain( NetQueue::Rx, 0, - &[(0, 500, VIRTQ_DESC_F_WRITE), (1, 500, VIRTQ_DESC_F_WRITE)], + &[ + (0, 500, VIRTQ_DESC_F_WRITE), + (1, 500, VIRTQ_DESC_F_WRITE), + (2, 526, VIRTQ_DESC_F_WRITE), + ], ); th.add_desc_chain( NetQueue::Rx, - 1000, - &[(2, 500, VIRTQ_DESC_F_WRITE), (3, 500, VIRTQ_DESC_F_WRITE)], + 2000, + &[ + (3, 500, VIRTQ_DESC_F_WRITE), + (4, 500, VIRTQ_DESC_F_WRITE), + (5, 526, VIRTQ_DESC_F_WRITE), + ], ); // Inject 2 frames to tap and run epoll. - let frame_1 = inject_tap_tx_frame(&th.net(), 200); - let frame_2 = inject_tap_tx_frame(&th.net(), 300); + let mut frame_1 = inject_tap_tx_frame(&th.net(), 200); + let mut frame_2 = inject_tap_tx_frame(&th.net(), 300); check_metric_after_block!( th.net().metrics.rx_packets_count, 2, @@ -1305,20 +1381,24 @@ pub mod tests { ); // Check that the frames weren't deferred. - assert!(!th.net().rx_deferred_frame); + assert!(th.net().rx_buffer.deferred_descriptor.is_none()); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. + header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq .check_used_elem(0, 0, frame_1.len().try_into().unwrap()); th.rxq.dtable[0].check_data(&frame_1); th.rxq.dtable[1].check_data(&[0; 500]); + th.rxq.dtable[2].check_data(&[0; 526]); // Check that the 2nd frame was written successfully to the 2nd Rx descriptor chain. + header_set_num_buffers(frame_2.as_mut_slice(), 1); th.rxq - .check_used_elem(1, 2, frame_2.len().try_into().unwrap()); - th.rxq.dtable[2].check_data(&frame_2); - th.rxq.dtable[3].check_data(&[0; 500]); + .check_used_elem(1, 3, frame_2.len().try_into().unwrap()); + th.rxq.dtable[3].check_data(&frame_2); + th.rxq.dtable[4].check_data(&[0; 500]); + th.rxq.dtable[2].check_data(&[0; 526]); } #[test] @@ -1605,6 +1685,19 @@ pub mod tests { fn test_mmds_detour_and_injection() { let mut net = default_net(); + // Inject a fake buffer in the devices buffers, otherwise we won't be able to receive the + // MMDS frame. One iovec will be just fine. 
+        let mut fake_buffer = vec![0u8; 1024];
+        let iov_buffer = IoVecBufferMut::from(fake_buffer.as_mut_slice());
+        net.rx_buffer.iovec = iov_buffer;
+        net.rx_buffer
+            .parsed_descriptors
+            .push_back(ParsedDescriptorChain {
+                head_index: 1,
+                length: 1024,
+                nr_iovecs: 1,
+            });
+
         let src_mac = MacAddr::from_str("11:11:11:11:11:11").unwrap();
         let src_ip = Ipv4Addr::new(10, 1, 2, 3);
         let dst_mac = MacAddr::from_str("22:22:22:22:22:22").unwrap();
@@ -1721,8 +1814,12 @@
         // SAFETY: its a valid fd
         unsafe { libc::close(th.net.lock().unwrap().tap.as_raw_fd()) };
 
-        // The RX queue is empty and rx_deffered_frame is set.
-        th.net().rx_deferred_frame = true;
+        // The RX queue is empty and there is a deferred frame.
+        th.net().rx_buffer.deferred_descriptor = Some(ParsedDescriptorChain {
+            head_index: 1,
+            length: 100,
+            nr_iovecs: 1,
+        });
         check_metric_after_block!(
             th.net().metrics.no_rx_avail_buffer,
             1,
@@ -1732,68 +1829,14 @@
         // We need to set this here to false, otherwise the device will try to
         // handle a deferred frame, it will fail and will never try to read from
         // the tap.
-        th.net().rx_deferred_frame = false;
+        th.net().rx_buffer.deferred_descriptor = None;
 
-        // Fake an avail buffer; this time, tap reading should error out.
-        th.rxq.avail.idx.set(1);
-        check_metric_after_block!(
-            th.net().metrics.tap_read_fails,
-            1,
-            th.simulate_event(NetEvent::Tap)
-        );
-    }
-
-    #[test]
-    fn test_deferred_frame() {
-        let mem = single_region_mem(2 * MAX_BUFFER_SIZE);
-        let mut th = TestHelper::get_default(&mem);
-        th.activate_net();
-
-        let rx_packets_count = th.net().metrics.rx_packets_count.count();
-        let _ = inject_tap_tx_frame(&th.net(), 1000);
-        // Trigger a Tap event that. This should fail since there
-        // are not any available descriptors in the queue
-        check_metric_after_block!(
-            th.net().metrics.no_rx_avail_buffer,
-            1,
-            th.simulate_event(NetEvent::Tap)
-        );
-        // The frame we read from the tap should be deferred now and
-        // no frames should have been transmitted
-        assert!(th.net().rx_deferred_frame);
-        assert_eq!(th.net().metrics.rx_packets_count.count(), rx_packets_count);
-
-        // Let's add a second frame, which should really have the same
-        // fate.
-        let _ = inject_tap_tx_frame(&th.net(), 1000);
-
-        // Adding a descriptor in the queue. This should handle the first deferred
-        // frame. However, this should try to handle the second tap as well and fail
-        // since there's only one Descriptor Chain in the queue.
         th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
         check_metric_after_block!(
-            th.net().metrics.no_rx_avail_buffer,
+            th.net().metrics.tap_read_fails,
             1,
             th.simulate_event(NetEvent::Tap)
         );
-        // We should still have a deferred frame
-        assert!(th.net().rx_deferred_frame);
-        // However, we should have delivered the first frame
-        assert_eq!(
-            th.net().metrics.rx_packets_count.count(),
-            rx_packets_count + 1
-        );
-
-        // Let's add one more descriptor and try to handle the last frame as well.
-        th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
-        check_metric_after_block!(
-            th.net().metrics.rx_packets_count,
-            1,
-            th.simulate_event(NetEvent::RxQueue)
-        );
-
-        // We should be done with any deferred frame
-        assert!(!th.net().rx_deferred_frame);
     }
 
     #[test]
@@ -1908,10 +1951,10 @@
         let mut rl = RateLimiter::new(1000, 0, 500, 0, 0, 0).unwrap();
 
         // set up RX
-        assert!(!th.net().rx_deferred_frame);
+        assert!(th.net().rx_buffer.deferred_descriptor.is_none());
         th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
-        let frame = inject_tap_tx_frame(&th.net(), 1000);
+        let mut frame = inject_tap_tx_frame(&th.net(), 1000);
 
         // use up the budget (do it after injecting the tx frame, as socket communication is
         // slow enough that the ratelimiter could replenish in the meantime).
@@ -1928,7 +1971,7 @@
            // assert that limiter is blocked
            assert!(th.net().rx_rate_limiter.is_blocked());
            assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1);
-            assert!(th.net().rx_deferred_frame);
+            assert!(th.net().rx_buffer.deferred_descriptor.is_some());
            // assert that no operation actually completed (limiter blocked it)
            assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
            // make sure the data is still queued for processing
@@ -1963,6 +2006,7 @@
            assert_eq!(th.rxq.used.idx.get(), 1);
            th.rxq
                .check_used_elem(0, 0, frame.len().try_into().unwrap());
+            header_set_num_buffers(frame.as_mut_slice(), 1);
            th.rxq.dtable[0].check_data(&frame);
        }
    }
@@ -2026,9 +2070,9 @@
        let mut rl = RateLimiter::new(0, 0, 0, 1, 0, 500).unwrap();
 
        // set up RX
-        assert!(!th.net().rx_deferred_frame);
+        assert!(th.net().rx_buffer.deferred_descriptor.is_none());
        th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
-        let frame = inject_tap_tx_frame(&th.net(), 1234);
+        let mut frame = inject_tap_tx_frame(&th.net(), 1234);
 
        // use up the initial budget
        assert!(rl.consume(1, TokenType::Ops));
@@ -2048,7 +2092,7 @@
            // assert that limiter is blocked
            assert!(th.net().rx_rate_limiter.is_blocked());
            assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1);
-            assert!(th.net().rx_deferred_frame);
+            assert!(th.net().rx_buffer.deferred_descriptor.is_some());
            // assert that no operation actually completed (limiter blocked it)
            assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring));
            // make sure the data is still queued for processing
@@ -2075,6 +2119,7 @@
            assert_eq!(th.rxq.used.idx.get(), 1);
            th.rxq
                .check_used_elem(0, 0, frame.len().try_into().unwrap());
+            header_set_num_buffers(frame.as_mut_slice(), 1);
            th.rxq.dtable[0].check_data(&frame);
        }
    }
diff --git a/src/vmm/src/devices/virtio/net/mod.rs b/src/vmm/src/devices/virtio/net/mod.rs
index 1a7972595ad..e8a3f86ac72 100644
--- a/src/vmm/src/devices/virtio/net/mod.rs
+++ b/src/vmm/src/devices/virtio/net/mod.rs
@@ -27,8 +27,10 @@ pub mod test_utils;
 mod gen;
 
 pub use tap::{Tap, TapError};
+use vm_memory::VolatileMemoryError;
 
 pub use self::device::Net;
+use super::iovec::IoVecError;
 
 /// Enum representing the Net device queue types
 #[derive(Debug)]
@@ -50,6 +52,10 @@ pub enum NetError {
     EventFd(io::Error),
     /// IO error: {0}
     IO(io::Error),
+    /// Error writing to guest memory: {0}
+    GuestMemoryError(#[from] VolatileMemoryError),
     /// The VNET header is missing from the frame
     VnetHeaderMissing,
+    /// IoVecBuffer(Mut) error: {0}
+    IoVecError(#[from] IoVecError),
 }
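
// NOTE (editor): illustrative sketch only, not part of the patch above. Assuming `NetError`
// keeps the thiserror-style derive it already carries (which is what gives `#[from]` its
// `From` impls), the two new variants let the RX path bubble errors up with `?`. The function
// name and bindings below are made up for the example.
fn error_plumbing_sketch() -> Result<(), NetError> {
    let volatile_write: Result<(), VolatileMemoryError> = Ok(());
    let iovec_parse: Result<(), IoVecError> = Ok(());
    volatile_write?; // a failure here becomes NetError::GuestMemoryError
    iovec_parse?; // a failure here becomes NetError::IoVecError
    Ok(())
}
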
diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs
index 4f0ae35d966..c7918f07ab3 100644
--- a/src/vmm/src/devices/virtio/net/persist.rs
+++ b/src/vmm/src/devices/virtio/net/persist.rs
@@ -9,9 +9,10 @@ use std::sync::{Arc, Mutex};
 
 use serde::{Deserialize, Serialize};
 
-use super::device::Net;
-use super::{TapError, NET_NUM_QUEUES};
+use super::device::{Net, RxBuffers};
+use super::{TapError, NET_NUM_QUEUES, RX_INDEX};
 use crate::devices::virtio::device::DeviceState;
+use crate::devices::virtio::iovec::ParsedDescriptorChain;
 use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState};
 use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE;
 use crate::devices::virtio::TYPE_NET;
@@ -31,6 +32,23 @@ pub struct NetConfigSpaceState {
     guest_mac: Option<MacAddr>,
 }
 
+/// Information about the parsed RX buffers
+#[derive(Debug, Default, Clone, Serialize, Deserialize)]
+pub struct RxBufferState {
+    // Number of descriptor chains we have parsed from the guest
+    parsed_descriptor_chains_nr: u16,
+    deferred_descriptor: Option<ParsedDescriptorChain>,
+}
+
+impl RxBufferState {
+    fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self {
+        RxBufferState {
+            parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(),
+            deferred_descriptor: rx_buffer.deferred_descriptor.clone(),
+        }
+    }
+}
+
 /// Information about the network device that are saved
 /// at snapshot.
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -43,6 +61,7 @@ pub struct NetState {
     pub mmds_ns: Option<MmdsNetworkStackState>,
     config_space: NetConfigSpaceState,
     virtio_state: VirtioDeviceState,
+    rx_buffers_state: RxBufferState,
 }
 
 /// Auxiliary structure for creating a device when resuming from a snapshot.
@@ -85,6 +104,7 @@ impl Persist<'_> for Net {
                 guest_mac: self.guest_mac,
             },
            virtio_state: VirtioDeviceState::from_device(self),
+            rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer),
        }
    }
 
@@ -137,6 +157,14 @@ impl Persist<'_> for Net {
                .map_err(NetPersistError::TapSetOffload)?;
 
            net.device_state = DeviceState::Activated(constructor_args.mem);
+
+            // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily
+            // rolling back `next_avail` in the RX queue and calling `parse_rx_descriptors`.
+            net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr;
+            net.parse_rx_descriptors();
+            net.rx_buffer
+                .deferred_descriptor
+                .clone_from(&state.rx_buffers_state.deferred_descriptor);
        }
 
        Ok(net)
diff --git a/src/vmm/src/devices/virtio/net/tap.rs b/src/vmm/src/devices/virtio/net/tap.rs
index 20024a1ae8e..4d1757edc8e 100644
--- a/src/vmm/src/devices/virtio/net/tap.rs
+++ b/src/vmm/src/devices/virtio/net/tap.rs
@@ -7,7 +7,7 @@
 use std::fmt::{self, Debug};
 use std::fs::File;
-use std::io::{Error as IoError, Read};
+use std::io::Error as IoError;
 use std::os::raw::*;
 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
 
@@ -190,11 +190,19 @@ impl Tap {
         }
         Ok(usize::try_from(ret).unwrap())
     }
-}
 
-impl Read for Tap {
-    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
-        self.tap_file.read(buf)
+    /// Read from tap to an `IoVecBufferMut`
+    pub(crate) fn read_iovec(&mut self, buffer: &mut [libc::iovec]) -> Result<usize, IoError> {
+        let iov = buffer.as_mut_ptr();
+        let iovcnt = buffer.len().try_into().unwrap();
+
+        // SAFETY: `readv` is safe. It is called with a valid tap fd; the iovec pointer and length
+        // are provided by the `IoVecBufferMut` implementation, and we check the return value.
+        let ret = unsafe { libc::readv(self.tap_file.as_raw_fd(), iov, iovcnt) };
+        if ret == -1 {
+            return Err(IoError::last_os_error());
+        }
+        Ok(usize::try_from(ret).unwrap())
     }
 }
 
@@ -211,6 +219,7 @@ pub mod tests {
     use std::os::unix::ffi::OsStrExt;
 
     use super::*;
+    use crate::devices::virtio::iovec::IoVecBufferMut;
     use crate::devices::virtio::net::gen;
     use crate::devices::virtio::net::test_utils::{enable, if_index, TapTrafficSimulator};
 
@@ -218,7 +227,6 @@
     const VNET_HDR_SIZE: usize = 10;
     const PAYLOAD_SIZE: usize = 512;
-    const PACKET_SIZE: usize = 1024;
 
     #[test]
     fn test_tap_name() {
@@ -287,23 +295,6 @@
         assert_eq!(tap.as_raw_fd(), tap.tap_file.as_raw_fd());
     }
 
-    #[test]
-    fn test_read() {
-        let mut tap = Tap::open_named("").unwrap();
-        enable(&tap);
-        let tap_traffic_simulator = TapTrafficSimulator::new(if_index(&tap));
-
-        let packet = vmm_sys_util::rand::rand_alphanumerics(PAYLOAD_SIZE);
-        tap_traffic_simulator.push_tx_packet(packet.as_bytes());
-
-        let mut buf = [0u8; PACKET_SIZE];
-        assert_eq!(tap.read(&mut buf).unwrap(), PAYLOAD_SIZE + VNET_HDR_SIZE);
-        assert_eq!(
-            &buf[VNET_HDR_SIZE..packet.len() + VNET_HDR_SIZE],
-            packet.as_bytes()
-        );
-    }
-
     #[test]
     fn test_write_iovec() {
         let mut tap = Tap::open_named("").unwrap();
@@ -339,4 +330,26 @@
             fragment3
         );
     }
+
+    #[test]
+    fn test_read_iovec() {
+        let mut tap = Tap::open_named("").unwrap();
+        enable(&tap);
+        let tap_traffic_simulator = TapTrafficSimulator::new(if_index(&tap));
+
+        let mut buff1 = vec![0; PAYLOAD_SIZE + VNET_HDR_SIZE];
+        let mut buff2 = vec![0; 2 * PAYLOAD_SIZE];
+
+        let mut rx_buffers = IoVecBufferMut::from(vec![buff1.as_mut_slice(), buff2.as_mut_slice()]);
+
+        let packet = vmm_sys_util::rand::rand_alphanumerics(2 * PAYLOAD_SIZE);
+        tap_traffic_simulator.push_tx_packet(packet.as_bytes());
+        assert_eq!(
+            tap.read_iovec(rx_buffers.as_iovec_mut_slice()).unwrap(),
+            2 * PAYLOAD_SIZE + VNET_HDR_SIZE
+        );
+        assert_eq!(&buff1[VNET_HDR_SIZE..], &packet.as_bytes()[..PAYLOAD_SIZE]);
+        assert_eq!(&buff2[..PAYLOAD_SIZE], &packet.as_bytes()[PAYLOAD_SIZE..]);
+        assert_eq!(&buff2[PAYLOAD_SIZE..], &vec![0; PAYLOAD_SIZE])
+    }
 }
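
// NOTE (editor): illustrative sketch only, not part of the patch above. It demonstrates the
// scatter-gather read pattern that `Tap::read_iovec` builds on, using a `UnixStream` pair
// instead of a tap device so it is self-contained and runnable anywhere. All names below are
// made up for the example.
#[cfg(test)]
mod readv_sketch {
    use std::io::Write;
    use std::os::unix::io::AsRawFd;
    use std::os::unix::net::UnixStream;

    #[test]
    fn scatter_read_demo() {
        // One end stands in for the tap fd; 24 bytes are queued on it before we read.
        let (mut tx, rx) = UnixStream::pair().unwrap();
        tx.write_all(&[0xAA; 24]).unwrap();

        // Two separate buffers. readv() fills them in order with a single syscall and no
        // intermediate copy, which is why the RX path can target guest memory directly.
        let mut head = [0u8; 10];
        let mut body = [0u8; 14];
        let mut iovs = [
            libc::iovec {
                iov_base: head.as_mut_ptr().cast(),
                iov_len: head.len(),
            },
            libc::iovec {
                iov_base: body.as_mut_ptr().cast(),
                iov_len: body.len(),
            },
        ];

        // SAFETY: the fd is valid and both iovecs point to live buffers of the stated lengths.
        let ret =
            unsafe { libc::readv(rx.as_raw_fd(), iovs.as_mut_ptr(), iovs.len() as libc::c_int) };
        assert_eq!(ret, 24);
        assert_eq!(head, [0xAA; 10]);
        assert_eq!(body, [0xAA; 14]);
    }
}
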
diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs
index 07808bbb44b..eb1c6f6e883 100644
--- a/src/vmm/src/devices/virtio/net/test_utils.rs
+++ b/src/vmm/src/devices/virtio/net/test_utils.rs
@@ -430,8 +430,9 @@ pub mod test {
             event_fd.write(1).unwrap();
         }
 
-        /// Generate a tap frame of `frame_len` and check that it is deferred
-        pub fn check_rx_deferred_frame(&mut self, frame_len: usize) -> Vec<u8> {
+        /// Generate a tap frame of `frame_len` and check that it is not read and that
+        /// the descriptor chain has been discarded
+        pub fn check_rx_discarded_buffer(&mut self, frame_len: usize) -> Vec<u8> {
             let used_idx = self.rxq.used.idx.get();
 
             // Inject frame to tap and run epoll.
@@ -441,8 +442,6 @@ pub mod test {
                 0,
                 self.event_manager.run_with_timeout(100).unwrap()
             );
-            // Check that the frame has been deferred.
-            assert!(self.net().rx_deferred_frame);
             // Check that the descriptor chain has been discarded.
             assert_eq!(self.rxq.used.idx.get(), used_idx + 1);
             assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring));
@@ -454,16 +453,9 @@ pub mod test {
         /// is eventually received by the guest
         pub fn check_rx_queue_resume(&mut self, expected_frame: &[u8]) {
             let used_idx = self.rxq.used.idx.get();
-            // Add a valid Rx avail descriptor chain and run epoll.
-            self.add_desc_chain(
-                NetQueue::Rx,
-                0,
-                &[(
-                    0,
-                    u32::try_from(expected_frame.len()).unwrap(),
-                    VIRTQ_DESC_F_WRITE,
-                )],
-            );
+            // Add a valid Rx avail descriptor chain and run epoll. We do not negotiate any feature
+            // offloading, so the buffers need to be at least 1526 bytes long.
+            self.add_desc_chain(NetQueue::Rx, 0, &[(0, 1526, VIRTQ_DESC_F_WRITE)]);
             check_metric_after_block!(
                 self.net().metrics.rx_packets_count,
                 1,
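
// NOTE (editor): sketch only, not part of the patch above. It just spells out where the
// 1526-byte minimum used for the RX test buffers comes from when no offloads are negotiated:
// a 12-byte virtio-net header (`virtio_net_hdr_v1`) in front of a maximum-sized Ethernet frame
// (14-byte Ethernet header + 1500-byte MTU, FCS excluded). The constant names are made up.
const VNET_HDR_LEN: usize = 12; // size of `virtio_net_hdr_v1`
const MAX_ETH_FRAME_LEN: usize = 14 + 1500; // Ethernet header + MTU = 1514
const MIN_RX_BUFFER_LEN: usize = VNET_HDR_LEN + MAX_ETH_FRAME_LEN; // = 1526
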
diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs
index 4bdea1c40f6..6a73816947a 100644
--- a/src/vmm/src/devices/virtio/rng/device.rs
+++ b/src/vmm/src/devices/virtio/rng/device.rs
@@ -112,7 +112,7 @@ impl Entropy {
             return Ok(0);
         }
 
-        let mut rand_bytes = vec![0; iovec.len() as usize];
+        let mut rand_bytes = vec![0; iovec.len()];
         rand::fill(&mut rand_bytes).map_err(|err| {
             METRICS.host_rng_fails.inc();
             err
@@ -120,7 +122,9 @@
         // It is ok to unwrap here. We are writing `iovec.len()` bytes at offset 0.
         iovec.write_all_volatile_at(&rand_bytes, 0).unwrap();
 
-        Ok(iovec.len())
+        // It is ok to unwrap here. `iovec` contains only a single `DescriptorChain`, which means
+        // that its length fits in a u32.
+        Ok(u32::try_from(iovec.len()).unwrap())
     }
 
     fn process_entropy_queue(&mut self) {
@@ -145,7 +147,7 @@
             // Check for available rate limiting budget.
             // If not enough budget is available, leave the request descriptor in the queue
             // to handle once we do have budget.
-            if !Self::rate_limit_request(&mut self.rate_limiter, u64::from(iovec.len())) {
+            if !Self::rate_limit_request(&mut self.rate_limiter, iovec.len() as u64) {
                 debug!("entropy: throttling entropy queue");
                 METRICS.entropy_rate_limiter_throttled.inc();
                 self.queues[RNG_QUEUE].undo_pop();
diff --git a/src/vmm/src/devices/virtio/vsock/mod.rs b/src/vmm/src/devices/virtio/vsock/mod.rs
index 7fdc86aed2e..879097fef45 100644
--- a/src/vmm/src/devices/virtio/vsock/mod.rs
+++ b/src/vmm/src/devices/virtio/vsock/mod.rs
@@ -30,6 +30,7 @@ pub use self::defs::uapi::VIRTIO_ID_VSOCK as TYPE_VSOCK;
 pub use self::defs::VSOCK_DEV_ID;
 pub use self::device::Vsock;
 pub use self::unix::{VsockUnixBackend, VsockUnixBackendError};
+use super::iov_deque::IovDequeError;
 use crate::devices::virtio::iovec::IoVecError;
 use crate::devices::virtio::persist::PersistError as VirtioStateError;
 
@@ -138,6 +139,8 @@ pub enum VsockError {
     VirtioState(VirtioStateError),
     /// Vsock uds backend error: {0}
     VsockUdsBackend(VsockUnixBackendError),
+    /// Underlying IovDeque error: {0}
+    IovDeque(IovDequeError),
 }
 
 impl From<IoVecError> for VsockError {
@@ -147,6 +150,7 @@ impl From<IoVecError> for VsockError {
             IoVecError::ReadOnlyDescriptor => VsockError::UnwritableDescriptor,
             IoVecError::GuestMemory(err) => VsockError::GuestMemoryMmap(err),
             IoVecError::OverflowedDescriptor => VsockError::DescChainOverflow,
+            IoVecError::IovDeque(err) => VsockError::IovDeque(err),
         }
     }
 }
diff --git a/src/vmm/src/devices/virtio/vsock/packet.rs b/src/vmm/src/devices/virtio/vsock/packet.rs
index d63dd41386e..f43c586330c 100644
--- a/src/vmm/src/devices/virtio/vsock/packet.rs
+++ b/src/vmm/src/devices/virtio/vsock/packet.rs
@@ -172,8 +172,10 @@ impl VsockPacket {
         // are live at the same time, meaning this has exclusive ownership over the memory
         let buffer = unsafe {
             IoVecBufferMut::from_descriptor_chain(mem, chain)?
         };
 
-        if buffer.len() < VSOCK_PKT_HDR_SIZE {
-            return Err(VsockError::DescChainTooShortForHeader(buffer.len() as usize));
+        // It is ok to unwrap the conversion from usize to u32, because the `buffer` only contains
+        // a single `DescriptorChain`, so its length fits in a u32.
+        if (u32::try_from(buffer.len()).unwrap()) < VSOCK_PKT_HDR_SIZE {
+            return Err(VsockError::DescChainTooShortForHeader(buffer.len()));
         }
 
         Ok(Self {
@@ -222,7 +224,7 @@ impl VsockPacket {
     pub fn buf_size(&self) -> u32 {
         let chain_length = match self.buffer {
             VsockPacketBuffer::Tx(ref iovec_buf) => iovec_buf.len(),
-            VsockPacketBuffer::Rx(ref iovec_buf) => iovec_buf.len(),
+            VsockPacketBuffer::Rx(ref iovec_buf) => u32::try_from(iovec_buf.len()).unwrap(),
         };
         chain_length - VSOCK_PKT_HDR_SIZE
     }
@@ -237,8 +239,8 @@ impl VsockPacket {
             VsockPacketBuffer::Tx(_) => Err(VsockError::UnwritableDescriptor),
             VsockPacketBuffer::Rx(ref mut buffer) => {
                 if count
-                    > buffer
-                        .len()
+                    > u32::try_from(buffer.len())
+                        .unwrap()
                         .saturating_sub(VSOCK_PKT_HDR_SIZE)
                         .saturating_sub(offset)
                 {