diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index f0a27b6e40d..868e7ce0e99 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -32,108 +32,6 @@ "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" }, - { - "syscall": "readv", - "comment": "Used by the VirtIO net device to read from tap" - }, - { - "syscall": "memfd_create", - "comment": "Used by the IovDeque implementation" - }, - { - "syscall": "fcntl", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 1033, - "comment": "FCNTL_F_SETFD" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 6, - "comment": "F_SEAL_SHRINK|F_SEAL_GROW" - } - ] - }, - { - "syscall": "fcntl", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 1033, - "comment": "FCNTL_F_SETFD" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 1, - "comment": "F_SEAL_SEAL" - } - ] - }, - { - "syscall": "mmap", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 4096, - "comment": "Page size allocation" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 3, - "comment": "PROT_READ|PROT_WRITE" - }, - { - "index": 3, - "type": "dword", - "op": "eq", - "val": 17, - "comment": "MAP_SHARED|MAP_FIXED" - } - ] - }, - { - "syscall": "mmap", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 8192, - "comment": "2 pages allocation" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 0, - "comment": "PROT_NONE" - }, - { - "index": 3, - "type": "dword", - "op": "eq", - "val": 34, - "comment": "MAP_PRIVATE|MAP_ANONYMOUS" - } - ] - }, { "syscall": "fsync" }, diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 630211f47d1..e5b4b690196 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -32,108 +32,6 @@ "syscall": "writev", "comment": "Used by the VirtIO net device to write to tap" }, - { - "syscall": "readv", - "comment": "Used by the VirtIO net device to read from tap" - }, - { - "syscall": "memfd_create", - "comment": "Used by the IovDeque implementation" - }, - { - "syscall": "fcntl", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 1033, - "comment": "FCNTL_F_SETFD" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 6, - "comment": "F_SEAL_SHRINK|F_SEAL_GROW" - } - ] - }, - { - "syscall": "fcntl", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 1033, - "comment": "FCNTL_F_SETFD" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 1, - "comment": "F_SEAL_SEAL" - } - ] - }, - { - "syscall": "mmap", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 4096, - "comment": "Page size allocation" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 3, - "comment": "PROT_READ|PROT_WRITE" - }, - { - "index": 3, - "type": "dword", - "op": "eq", - "val": 17, - "comment": "MAP_SHARED|MAP_FIXED" - } - ] - }, - { 
- "syscall": "mmap", - "comment": "Used by the IovDeque implementation", - "args": [ - { - "index": 1, - "type": "dword", - "op": "eq", - "val": 8192, - "comment": "2 pages allocation" - }, - { - "index": 2, - "type": "dword", - "op": "eq", - "val": 0, - "comment": "PROT_NONE" - }, - { - "index": 3, - "type": "dword", - "op": "eq", - "val": 34, - "comment": "MAP_PRIVATE|MAP_ANONYMOUS" - } - ] - }, { "syscall": "fsync" }, diff --git a/src/vmm/src/devices/virtio/iov_deque.rs b/src/vmm/src/devices/virtio/iov_deque.rs deleted file mode 100644 index 0d801d10f3d..00000000000 --- a/src/vmm/src/devices/virtio/iov_deque.rs +++ /dev/null @@ -1,449 +0,0 @@ -// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -use std::os::fd::AsRawFd; - -use libc::{c_int, c_void, iovec, off_t, size_t}; -use memfd; - -use super::queue::FIRECRACKER_MAX_QUEUE_SIZE; -use crate::arch::PAGE_SIZE; - -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum IovDequeError { - /// Error with memfd: {0} - Memfd(#[from] memfd::Error), - /// Error while resizing memfd: {0} - MemfdResize(std::io::Error), - /// Error calling mmap: {0} - Mmap(std::io::Error), -} - -/// ['IovDeque'] is a ring buffer tailored for `struct iovec` objects. -/// -/// From the point of view of API, [`IovDeque`] is a typical ring buffer that allows us to push -/// `struct iovec` objects at the end of the buffer and pop them from its beginning. -/// -/// It is tailored to store `struct iovec` objects that described memory that was passed to us from -/// the guest via a VirtIO queue. This allows us to assume the maximum size of a ring buffer (the -/// negotiated size of the queue). -// An important feature of the data structure is that it can give us a slice of all `struct iovec` -// objects in the queue, so that we can use this `&mut [iovec]` to perform operations such as -// `readv`. A typical implementation of a ring buffer allows for entries to wrap around the end of -// the underlying buffer. For example, a ring buffer with a capacity of 10 elements which -// currently holds 4 elements can look like this: -// -// tail head -// | | -// v v -// +---+---+---+---+---+---+---+---+---+---+ -// ring buffer: | C | D | | | | | | | A | B | -// +---+---+---+---+---+---+---+---+---+---+ -// -// When getting a slice for this data we should get something like that: &[A, B, C, D], which -// would require copies in order to make the elements continuous in memory. -// -// In order to avoid that and make the operation of getting a slice more efficient, we implement -// the optimization described in the "Optimization" section of the "Circular buffer" wikipedia -// entry: https://en.wikipedia.org/wiki/Circular_buffer. The optimization consists of allocating -// double the size of the virtual memory required for the buffer and map both parts on the same -// physical address. 
Looking at the same example as before, we should get, this picture: -// -// head | tail -// | | | -// v | v -// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ -// | C | D | | | | | | | A | B | C | D | | | | | | | A | B | -// +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ -// First virtual page | Second virtual page -// | -// | -// -// Virtual memory -// --------------------------------------------------------------------------------------- -// Physical memory -// -// +---+---+---+---+---+---+---+---+---+---+ -// | C | D | | | | | | | A | B | -// +---+---+---+---+---+---+---+---+---+---+ -// -// Like that, the elements stored in the buffer are always laid out in contiguous virtual memory, -// so making a slice out of them does not require any copies. -#[derive(Debug)] -pub struct IovDeque { - pub iov: *mut libc::iovec, - pub start: u16, - pub len: u16, -} - -// SAFETY: This is `Send`. We hold sole ownership of the underlying buffer. -unsafe impl Send for IovDeque {} - -impl IovDeque { - /// Create a [`memfd`] object that represents a single physical page - fn create_memfd() -> Result { - // Create a sealable memfd. - let opts = memfd::MemfdOptions::default().allow_sealing(true); - let mfd = opts.create("sized-1K")?; - - // Resize to system page size. - mfd.as_file() - .set_len(PAGE_SIZE.try_into().unwrap()) - .map_err(IovDequeError::MemfdResize)?; - - // Add seals to prevent further resizing. - mfd.add_seals(&[memfd::FileSeal::SealShrink, memfd::FileSeal::SealGrow])?; - - // Prevent further sealing changes. - mfd.add_seal(memfd::FileSeal::SealSeal)?; - - Ok(mfd) - } - - /// A safe wrapper on top of libc's `mmap` system call - /// - /// # Safety: Callers need to make sure that the arguments to `mmap` are valid - unsafe fn mmap( - addr: *mut c_void, - len: size_t, - prot: c_int, - flags: c_int, - fd: c_int, - offset: off_t, - ) -> Result<*mut c_void, IovDequeError> { - let ptr = libc::mmap(addr, len, prot, flags, fd, offset); - if ptr == libc::MAP_FAILED { - return Err(IovDequeError::Mmap(std::io::Error::last_os_error())); - } - - Ok(ptr) - } - - /// Allocate memory for our ring buffer - /// - /// This will allocate exactly two pages of virtual memory. In order to implement the - /// optimization that allows us to always have elements in contiguous memory we need - /// allocations at the granularity of `PAGE_SIZE`. Now, our queues are at maximum 256 - /// descriptors long and `struct iovec` looks like this: - /// - /// ```Rust - /// pub struct iovec { - /// pub iov_base: *mut ::c_void, - /// pub iov_len: ::size_t, - /// } - /// ``` - /// - /// so, it's 16 bytes long. As a result, we need a single page for holding the actual data of - /// our buffer. - fn allocate_ring_buffer_memory() -> Result<*mut c_void, IovDequeError> { - // The fact that we allocate two pages is due to the size of `struct iovec` times our queue - // size equals the page size. Add here a debug assertion to reflect that and ensure that we - // will adapt our logic if the assumption changes in the future. - const { - assert!( - std::mem::size_of::() * FIRECRACKER_MAX_QUEUE_SIZE as usize == PAGE_SIZE - ); - } - - // SAFETY: We are calling the system call with valid arguments - unsafe { - Self::mmap( - std::ptr::null_mut(), - PAGE_SIZE * 2, - libc::PROT_NONE, - libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, - -1, - 0, - ) - } - } - - /// Create a new [`IovDeque`] that can hold memory described by a single VirtIO queue. 
- pub fn new() -> Result { - let memfd = Self::create_memfd()?; - let raw_memfd = memfd.as_file().as_raw_fd(); - let buffer = Self::allocate_ring_buffer_memory()?; - - // Map the first page of virtual memory to the physical page described by the memfd object - // SAFETY: We are calling the system call with valid arguments - let _ = unsafe { - Self::mmap( - buffer, - PAGE_SIZE, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_SHARED | libc::MAP_FIXED, - raw_memfd, - 0, - ) - }?; - - // Map the second page of virtual memory to the physical page described by the memfd object - // - // SAFETY: This is safe because: - // * Both `buffer` and the result of `buffer.add(PAGE_SIZE)` are within bounds of the - // allocation we got from `Self::allocate_ring_buffer_memory`. - // * The computed offset is `PAGE_SIZE * size_of::() == PAGE_SIZE bytes` which fits - // in `isize` - // * The resulting pointer is the beginning of the second page of our allocation, so it - // doesn't wrap around the address space. - let next_page = unsafe { buffer.add(PAGE_SIZE) }; - - // SAFETY: We are calling the system call with valid arguments - let _ = unsafe { - Self::mmap( - next_page, - PAGE_SIZE, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_SHARED | libc::MAP_FIXED, - raw_memfd, - 0, - ) - }?; - - Ok(Self { - iov: buffer.cast(), - start: 0, - len: 0, - }) - } - - /// Returns the number of `iovec` objects currently in the [`IovDeque`] - #[inline(always)] - pub fn len(&self) -> u16 { - self.len - } - - /// Returns `true` if the [`IovDeque`] is full, `false` otherwise - #[inline(always)] - pub fn is_full(&self) -> bool { - self.len() == FIRECRACKER_MAX_QUEUE_SIZE - } - - /// Resets the queue, dropping all its elements. - #[inline(always)] - pub fn clear(&mut self) { - self.start = 0; - self.len = 0; - } - - /// Adds an `iovec` in the ring buffer. - /// - /// Returns an `IovDequeError::Full` error if the buffer is full. - pub fn push_back(&mut self, iov: iovec) { - // This should NEVER happen, since our ring buffer is as big as the maximum queue size. - // We also check for the sanity of the VirtIO queues, in queue.rs, which means that if we - // ever try to add something in a full ring buffer, there is an internal bug in the device - // emulation logic. Panic here because the device is hopelessly broken. - assert!( - !self.is_full(), - "The number of `iovec` objects is bigger than the available space" - ); - - // SAFETY: self.iov is a valid pointer and `self.start + self.len` is within range (we - // asserted before that the buffer is not full). - unsafe { - self.iov - .add((self.start + self.len) as usize) - .write_volatile(iov) - }; - self.len += 1; - } - - /// Pops the first `nr_iovecs` iovecs from the buffer. - /// - /// Returns the total number of bytes of all the popped iovecs. This will panic if we are asked - /// to pop more iovecs than what is currently available in the buffer. - pub fn pop_front(&mut self, nr_iovecs: u16) { - assert!( - self.len() >= nr_iovecs, - "Internal bug! Trying to drop more iovec objects than what is available" - ); - - self.start += nr_iovecs; - self.len -= nr_iovecs; - if self.start >= FIRECRACKER_MAX_QUEUE_SIZE { - self.start -= FIRECRACKER_MAX_QUEUE_SIZE; - } - } - - /// Get a slice of the iovec objects currently in the buffer. - pub fn as_slice(&self) -> &[iovec] { - // SAFETY: Here we create a slice out of the existing elements in the buffer (not the whole - // allocated memory). 
That means that we can: - // * We can read `self.len * mem::size_of::()` bytes out of the memory range we are - // returning. - // * `self.iov.add(self.start.into())` is a non-null pointer and aligned. - // * The underlying memory comes from a single allocation. - // * The returning pointer points to `self.len` consecutive initialized `iovec` objects. - // * We are only accessing the underlying memory through the returned slice. Since we are - // returning a slice of only the existing pushed elements the slice does not contain any - // aliasing references. - // * The slice can be up to 1 page long which is smaller than `isize::MAX`. - unsafe { - let slice_start = self.iov.add(self.start.into()); - std::slice::from_raw_parts(slice_start, self.len.into()) - } - } - - /// Get a mutable slice of the iovec objects currently in the buffer. - pub fn as_mut_slice(&mut self) -> &mut [iovec] { - // SAFETY: Here we create a slice out of the existing elements in the buffer (not the whole - // allocated memory). That means that we can: - // * We can read/write `self.len * mem::size_of::()` bytes out of the memory range we - // are returning. - // * The underlying memory comes from a single allocation. - // * `self.iov.add(self.start.into())` is a non-null pointer and aligned - // * The returning pointer points to `self.len` consecutive initialized `iovec` objects. - // * We are only accessing the underlying memory through the returned slice. Since we are - // returning a slice of only the existing pushed elements the slice does not contain any - // aliasing references. - // * The slice can be up to 1 page long which is smaller than `isize::MAX`. - unsafe { - let slice_start = self.iov.add(self.start.into()); - std::slice::from_raw_parts_mut(slice_start, self.len.into()) - } - } -} - -impl Drop for IovDeque { - fn drop(&mut self) { - // SAFETY: We are passing an address that we got from a previous allocation of `2 * - // PAGE_SIZE` bytes by calling mmap - let _ = unsafe { libc::munmap(self.iov.cast(), PAGE_SIZE * 2) }; - } -} - -#[cfg(test)] -mod tests { - use libc::iovec; - - use super::IovDeque; - - #[test] - fn test_new() { - let deque = IovDeque::new().unwrap(); - assert_eq!(deque.len(), 0); - } - - fn make_iovec(id: u16, len: u16) -> iovec { - iovec { - iov_base: id as *mut libc::c_void, - iov_len: len as usize, - } - } - - #[test] - #[should_panic] - fn test_push_back_too_many() { - let mut deque = IovDeque::new().unwrap(); - assert_eq!(deque.len(), 0); - - for i in 0u16..256 { - deque.push_back(make_iovec(i, i)); - assert_eq!(deque.len(), i + 1); - } - - deque.push_back(make_iovec(0, 0)); - } - - #[test] - #[should_panic] - fn test_pop_front_from_empty() { - let mut deque = IovDeque::new().unwrap(); - deque.pop_front(1); - } - - #[test] - #[should_panic] - fn test_pop_front_too_many() { - let mut deque = IovDeque::new().unwrap(); - deque.push_back(make_iovec(42, 42)); - deque.pop_front(2); - } - - #[test] - fn test_pop() { - let mut deque = IovDeque::new().unwrap(); - assert_eq!(deque.len(), 0); - assert!(!deque.is_full()); - deque.pop_front(0); - - for i in 0u16..256 { - deque.push_back(make_iovec(i, i)); - assert_eq!(deque.len(), i + 1); - } - - assert!(deque.is_full()); - assert!(deque.len() != 0); - - for i in 0u16..256 { - deque.pop_front(1); - assert_eq!(deque.len(), 256 - i - 1); - } - } - - #[test] - fn test_pop_many() { - let mut deque = IovDeque::new().unwrap(); - - for i in 0u16..256 { - deque.push_back(make_iovec(i, i)); - } - - deque.pop_front(1); - assert_eq!(deque.len(), 255); - 
deque.pop_front(2); - assert_eq!(deque.len(), 253); - deque.pop_front(4); - assert_eq!(deque.len(), 249); - deque.pop_front(8); - assert_eq!(deque.len(), 241); - deque.pop_front(16); - assert_eq!(deque.len(), 225); - deque.pop_front(32); - assert_eq!(deque.len(), 193); - deque.pop_front(64); - assert_eq!(deque.len(), 129); - deque.pop_front(128); - assert_eq!(deque.len(), 1); - } - - #[test] - fn test_as_slice() { - let mut deque = IovDeque::new().unwrap(); - assert!(deque.as_slice().is_empty()); - - for i in 0..256 { - deque.push_back(make_iovec(i, 100)); - assert_eq!(deque.as_slice().len(), (i + 1) as usize); - } - let copy: Vec = deque.as_slice().to_vec(); - - assert_eq!(copy.len(), deque.len() as usize); - for (i, iov) in deque.as_slice().iter().enumerate() { - assert_eq!(iov.iov_len, copy[i].iov_len); - } - } - - #[test] - fn test_as_mut_slice() { - let mut deque = IovDeque::new().unwrap(); - assert!(deque.as_mut_slice().is_empty()); - - for i in 0..256 { - deque.push_back(make_iovec(i, 100)); - assert_eq!(deque.as_mut_slice().len(), (i + 1) as usize); - } - - let copy: Vec = deque.as_mut_slice().to_vec(); - deque - .as_mut_slice() - .iter_mut() - .for_each(|iov| iov.iov_len *= 2); - - assert_eq!(copy.len(), deque.len() as usize); - for (i, iov) in deque.as_slice().iter().enumerate() { - assert_eq!(iov.iov_len, 2 * copy[i].iov_len); - } - } -} diff --git a/src/vmm/src/devices/virtio/iovec.rs b/src/vmm/src/devices/virtio/iovec.rs index 5a015975df0..3acde02fc05 100644 --- a/src/vmm/src/devices/virtio/iovec.rs +++ b/src/vmm/src/devices/virtio/iovec.rs @@ -4,15 +4,12 @@ use std::io::ErrorKind; use libc::{c_void, iovec, size_t}; -use serde::{Deserialize, Serialize}; -#[cfg(not(kani))] use smallvec::SmallVec; use vm_memory::bitmap::Bitmap; use vm_memory::{ GuestMemory, GuestMemoryError, ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile, }; -use super::iov_deque::{IovDeque, IovDequeError}; use crate::devices::virtio::queue::DescriptorChain; use crate::vstate::memory::GuestMemoryMmap; @@ -26,8 +23,6 @@ pub enum IoVecError { OverflowedDescriptor, /// Guest memory error: {0} GuestMemory(#[from] GuestMemoryError), - /// Error with underlying `IovDeque`: {0} - IovDeque(#[from] IovDequeError), } // Using SmallVec in the kani proofs causes kani to use unbounded amounts of memory @@ -219,55 +214,42 @@ impl IoVecBuffer { } } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ParsedDescriptorChain { - pub head_index: u16, - pub length: u32, - pub nr_iovecs: u16, -} - /// This is essentially a wrapper of a `Vec` which can be passed to `libc::readv`. /// /// It describes a write-only buffer passed to us by the guest that is scattered across multiple /// memory regions. Additionally, this wrapper provides methods that allow reading arbitrary ranges /// of data from that buffer. -#[derive(Debug)] +#[derive(Debug, Default, Clone)] pub struct IoVecBufferMut { // container of the memory regions included in this IO vector - pub vecs: IovDeque, + vecs: IoVecVec, // Total length of the IoVecBufferMut - pub len: usize, + len: u32, } impl IoVecBufferMut { - /// Parse a `DescriptorChain` object and append the memory regions it describes in the - /// underlying ring buffer. 
+ /// Create an `IoVecBuffer` from a `DescriptorChain` /// /// # Safety /// /// The descriptor chain cannot be referencing the same memory location as another chain - unsafe fn parse_descriptor( + pub unsafe fn load_descriptor_chain( &mut self, mem: &GuestMemoryMmap, head: DescriptorChain, - ) -> Result { - let head_index = head.index; + ) -> Result<(), IoVecError> { + self.clear(); + let mut next_descriptor = Some(head); - let mut length = 0u32; - let mut nr_iovecs = 0u16; while let Some(desc) = next_descriptor { if !desc.is_write_only() { - self.vecs.pop_front(nr_iovecs); return Err(IoVecError::ReadOnlyDescriptor); } // We use get_slice instead of `get_host_address` here in order to have the whole // range of the descriptor chain checked, i.e. [addr, addr + len) is a valid memory // region in the GuestMemoryMmap. - let slice = mem.get_slice(desc.addr, desc.len as usize).map_err(|err| { - self.vecs.pop_front(nr_iovecs); - err - })?; + let slice = mem.get_slice(desc.addr, desc.len as usize)?; // We need to mark the area of guest memory that will be mutated through this // IoVecBufferMut as dirty ahead of time, as we loose access to all @@ -275,79 +257,21 @@ impl IoVecBufferMut { slice.bitmap().mark_dirty(0, desc.len as usize); let iov_base = slice.ptr_guard_mut().as_ptr().cast::(); - self.vecs.push_back(iovec { + self.vecs.push(iovec { iov_base, iov_len: desc.len as size_t, }); - nr_iovecs += 1; - length = length + self.len = self + .len .checked_add(desc.len) - .ok_or(IoVecError::OverflowedDescriptor) - .map_err(|err| { - self.vecs.pop_front(nr_iovecs); - err - })?; + .ok_or(IoVecError::OverflowedDescriptor)?; next_descriptor = desc.next_descriptor(); } - self.len = self.len.checked_add(length as usize).ok_or_else(|| { - self.vecs.pop_front(nr_iovecs); - IoVecError::OverflowedDescriptor - })?; - - Ok(ParsedDescriptorChain { - head_index, - length, - nr_iovecs, - }) - } - - /// Create an empty `IoVecBufferMut`. - pub fn new() -> Result { - let vecs = IovDeque::new()?; - Ok(Self { vecs, len: 0 }) - } - - /// Create an `IoVecBufferMut` from a `DescriptorChain` - /// - /// This will clear any previous `iovec` objects in the buffer and load the new - /// [`DescriptorChain`]. - /// - /// # Safety - /// - /// The descriptor chain cannot be referencing the same memory location as another chain - pub unsafe fn load_descriptor_chain( - &mut self, - mem: &GuestMemoryMmap, - head: DescriptorChain, - ) -> Result<(), IoVecError> { - self.clear(); - let _ = self.parse_descriptor(mem, head)?; Ok(()) } - /// Append a `DescriptorChain` in this `IoVecBufferMut` - /// - /// # Safety - /// - /// The descriptor chain cannot be referencing the same memory location as another chain - pub unsafe fn append_descriptor_chain( - &mut self, - mem: &GuestMemoryMmap, - head: DescriptorChain, - ) -> Result { - self.parse_descriptor(mem, head) - } - - /// Drop memory from the `IoVecBufferMut` - /// - /// This will drop memory described by the `IoVecBufferMut` from the beginning. 
- pub fn drop_descriptor_chain(&mut self, parse_descriptor: &ParsedDescriptorChain) { - self.vecs.pop_front(parse_descriptor.nr_iovecs); - self.len -= parse_descriptor.length as usize; - } - /// Create an `IoVecBuffer` from a `DescriptorChain` /// /// # Safety @@ -357,29 +281,20 @@ impl IoVecBufferMut { mem: &GuestMemoryMmap, head: DescriptorChain, ) -> Result { - let mut new_buffer = Self::new()?; + let mut new_buffer = Self::default(); new_buffer.load_descriptor_chain(mem, head)?; Ok(new_buffer) } /// Get the total length of the memory regions covered by this `IoVecBuffer` - /// - /// In contrast to the equivalent [`IoVecBuffer::len()`] which returns `u32`, this one returns - /// `usize` since the buffer can contain multiple `DescriptorChain` objects, so we don't have - /// the limit that the length of a buffer is limited by `u32`. - pub(crate) fn len(&self) -> usize { + pub(crate) fn len(&self) -> u32 { self.len } - /// Returns a pointer to the memory keeping the `iovec` structs - pub fn as_iovec_mut_slice(&mut self) -> &mut [iovec] { - self.vecs.as_mut_slice() - } - /// Clears the `iovec` array pub fn clear(&mut self) { self.vecs.clear(); - self.len = 0; + self.len = 0u32; } /// Writes a number of bytes into the `IoVecBufferMut` starting at a given offset. @@ -398,7 +313,7 @@ impl IoVecBufferMut { mut buf: &[u8], offset: usize, ) -> Result<(), VolatileMemoryError> { - if offset < self.len() { + if offset < self.len() as usize { let expected = buf.len(); let bytes_written = self.write_volatile_at(&mut buf, offset, expected)?; @@ -427,7 +342,7 @@ impl IoVecBufferMut { ) -> Result { let mut total_bytes_read = 0; - for iov in self.vecs.as_slice() { + for iov in &self.vecs { if len == 0 { break; } @@ -476,7 +391,6 @@ mod tests { use vm_memory::VolatileMemoryError; use super::{IoVecBuffer, IoVecBufferMut}; - use crate::devices::virtio::iov_deque::IovDeque; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::VirtQueue; use crate::test_utils::multi_region_mem; @@ -515,33 +429,14 @@ mod tests { impl From<&mut [u8]> for IoVecBufferMut { fn from(buf: &mut [u8]) -> Self { - let mut vecs = IovDeque::new().unwrap(); - vecs.push_back(iovec { - iov_base: buf.as_mut_ptr().cast::(), - iov_len: buf.len(), - }); - Self { - vecs, - len: buf.len(), - } - } - } - - impl From> for IoVecBufferMut { - fn from(buffer: Vec<&mut [u8]>) -> Self { - let mut len = 0usize; - let mut vecs = IovDeque::new().unwrap(); - for slice in buffer { - len += slice.len(); - - vecs.push_back(iovec { - iov_base: slice.as_ptr() as *mut c_void, - iov_len: slice.len(), - }); + vecs: vec![iovec { + iov_base: buf.as_mut_ptr().cast::(), + iov_len: buf.len(), + }] + .into(), + len: buf.len().try_into().unwrap(), } - - Self { vecs, len } } } @@ -633,19 +528,8 @@ mod tests { let head = q.pop().unwrap(); // SAFETY: This descriptor chain is only loaded once in this test - let mut iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() }; + let iovec = unsafe { IoVecBufferMut::from_descriptor_chain(&mem, head).unwrap() }; assert_eq!(iovec.len(), 4 * 64); - - // We are creating a new queue where we can get descriptors from. Probably, this is not - // something that we will ever want to do, as `IoVecBufferMut`s are typically - // (concpetually) associated with a single `Queue`. We just do this here to be able to test - // the appending logic. 
- let (mut q, _) = write_only_chain(&mem); - let head = q.pop().unwrap(); - // SAFETY: it is actually unsafe, but we just want to check the length of the - // `IoVecBufferMut` after appending. - let _ = unsafe { iovec.append_descriptor_chain(&mem, head).unwrap() }; - assert_eq!(iovec.len(), 8 * 64); } #[test] @@ -795,7 +679,6 @@ mod tests { } #[cfg(kani)] -#[allow(dead_code)] // Avoid warning when using stubs mod verification { use std::mem::ManuallyDrop; @@ -804,9 +687,6 @@ mod verification { use vm_memory::VolatileSlice; use super::{IoVecBuffer, IoVecBufferMut, IoVecVec}; - use crate::arch::PAGE_SIZE; - use crate::devices::virtio::iov_deque::IovDeque; - use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; // Maximum memory size to use for our buffers. For the time being 1KB. const GUEST_MEMORY_SIZE: usize = 1 << 10; @@ -818,58 +698,14 @@ mod verification { // >= 1. const MAX_DESC_LENGTH: usize = 4; - mod stubs { - use super::*; - - /// This is a stub for the `IovDeque::push_back` method. - /// - /// `IovDeque` relies on a special allocation of two pages of virtual memory, where both of - /// these point to the same underlying physical page. This way, the contents of the first - /// page of virtual memory are automatically mirrored in the second virtual page. We do - /// that in order to always have the elements that are currently in the ring buffer in - /// consecutive (virtual) memory. - /// - /// To build this particular memory layout we create a new `memfd` object, allocate memory - /// with `mmap` and call `mmap` again to make sure both pages point to the page allocated - /// via the `memfd` object. These ffi calls make kani complain, so here we mock the - /// `IovDeque` object memory with a normal memory allocation of two pages worth of data. - /// - /// This stub helps imitate the effect of mirroring without all the elaborate memory - /// allocation trick. - pub fn push_back(deque: &mut IovDeque, iov: iovec) { - // This should NEVER happen, since our ring buffer is as big as the maximum queue size. - // We also check for the sanity of the VirtIO queues, in queue.rs, which means that if - // we ever try to add something in a full ring buffer, there is an internal - // bug in the device emulation logic. Panic here because the device is - // hopelessly broken. - assert!( - !deque.is_full(), - "The number of `iovec` objects is bigger than the available space" - ); - - let offset = (deque.start + deque.len) as usize; - let mirror = if offset >= FIRECRACKER_MAX_QUEUE_SIZE as usize { - offset - FIRECRACKER_MAX_QUEUE_SIZE as usize - } else { - offset + FIRECRACKER_MAX_QUEUE_SIZE as usize - }; - - // SAFETY: self.iov is a valid pointer and `self.start + self.len` is within range (we - // asserted before that the buffer is not full). - unsafe { deque.iov.add(offset).write_volatile(iov) }; - unsafe { deque.iov.add(mirror).write_volatile(iov) }; - deque.len += 1; - } - } - fn create_iovecs(mem: *mut u8, size: usize, nr_descs: usize) -> (IoVecVec, u32) { let mut vecs: Vec = Vec::with_capacity(nr_descs); let mut len = 0u32; for _ in 0..nr_descs { - // The `IoVecBuffer` constructors ensure that the memory region described by every + // The `IoVecBuffer(Mut)` constructors ensure that the memory region described by every // `Descriptor` in the chain is a valid, i.e. it is memory with then guest's memory // mmap. 
The assumption, here, that the last address is within the memory object's - // bound substitutes these checks that `IoVecBuffer::new() performs.` + // bound substitutes these checks that `IoVecBuffer(Mut)::new() performs.` let addr: usize = kani::any(); let iov_len: usize = kani::any_where(|&len| matches!(addr.checked_add(len), Some(x) if x <= size)); @@ -892,41 +728,6 @@ mod verification { } } - fn create_iov_deque() -> IovDeque { - // SAFETY: safe because the layout has non-zero size - let mem = unsafe { - std::alloc::alloc(std::alloc::Layout::from_size_align_unchecked( - 2 * PAGE_SIZE, - PAGE_SIZE, - )) - }; - IovDeque { - iov: mem.cast(), - start: kani::any_where(|&start| start < FIRECRACKER_MAX_QUEUE_SIZE), - len: 0, - } - } - - fn create_iovecs_mut(mem: *mut u8, size: usize, nr_descs: usize) -> (IovDeque, u32) { - let mut vecs = create_iov_deque(); - let mut len = 0u32; - for _ in 0..nr_descs { - // The `IoVecBufferMut` constructors ensure that the memory region described by every - // `Descriptor` in the chain is a valid, i.e. it is memory with then guest's memory - // mmap. The assumption, here, that the last address is within the memory object's - // bound substitutes these checks that `IoVecBufferMut::new() performs.` - let addr: usize = kani::any(); - let iov_len: usize = - kani::any_where(|&len| matches!(addr.checked_add(len), Some(x) if x <= size)); - let iov_base = unsafe { mem.offset(addr.try_into().unwrap()) } as *mut c_void; - - vecs.push_back(iovec { iov_base, iov_len }); - len += u32::try_from(iov_len).unwrap(); - } - - (vecs, len) - } - impl IoVecBufferMut { fn any_of_length(nr_descs: usize) -> Self { // We only write into `IoVecBufferMut` objects, so we can simply create a guest memory @@ -938,11 +739,8 @@ mod verification { )) }; - let (vecs, len) = create_iovecs_mut(mem, GUEST_MEMORY_SIZE, nr_descs); - Self { - vecs, - len: len.try_into().unwrap(), - } + let (vecs, len) = create_iovecs(mem, GUEST_MEMORY_SIZE, nr_descs); + Self { vecs, len } } } @@ -1017,13 +815,12 @@ mod verification { #[kani::proof] #[kani::unwind(5)] #[kani::solver(cadical)] - #[kani::stub(IovDeque::push_back, stubs::push_back)] fn verify_write_to_iovec() { for nr_descs in 0..MAX_DESC_LENGTH { let mut iov_mut = IoVecBufferMut::any_of_length(nr_descs); let mut buf = kani::vec::any_vec::(); - let offset: usize = kani::any(); + let offset: u32 = kani::any(); // We can't really check the contents that the operation here writes into // `IoVecBufferMut`, because our `IoVecBufferMut` being completely arbitrary @@ -1038,11 +835,14 @@ mod verification { // Ok(...) 
assert_eq!( iov_mut - .write_volatile_at(&mut KaniBuffer(&mut buf), offset, GUEST_MEMORY_SIZE) + .write_volatile_at( + &mut KaniBuffer(&mut buf), + offset as usize, + GUEST_MEMORY_SIZE + ) .unwrap(), buf.len().min(iov_mut.len().saturating_sub(offset) as usize) ); - std::mem::forget(iov_mut.vecs); } } } diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs index 9931e1211d1..f68c2a123c9 100644 --- a/src/vmm/src/devices/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/mod.rs @@ -16,7 +16,6 @@ pub mod balloon; pub mod block; pub mod device; pub mod gen; -mod iov_deque; pub mod iovec; pub mod mmio; pub mod net; diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index ed6f36dcb0b..f8c29f95175 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -5,13 +5,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. -use std::collections::VecDeque; +use std::io::Read; use std::mem; use std::net::Ipv4Addr; use std::sync::{Arc, Mutex}; use libc::EAGAIN; -use log::error; +use log::{error, warn}; +use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; @@ -22,15 +23,13 @@ use crate::devices::virtio::gen::virtio_net::{ VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_MAC, }; use crate::devices::virtio::gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use crate::devices::virtio::iovec::{ - IoVecBuffer, IoVecBufferMut, IoVecError, ParsedDescriptorChain, -}; +use crate::devices::virtio::iovec::IoVecBuffer; use crate::devices::virtio::net::metrics::{NetDeviceMetrics, NetMetricsPerDevice}; use crate::devices::virtio::net::tap::Tap; use crate::devices::virtio::net::{ gen, NetError, NetQueue, MAX_BUFFER_SIZE, NET_QUEUE_SIZES, RX_INDEX, TX_INDEX, }; -use crate::devices::virtio::queue::{DescriptorChain, Queue, FIRECRACKER_MAX_QUEUE_SIZE}; +use crate::devices::virtio::queue::{DescriptorChain, Queue}; use crate::devices::virtio::{ActivateError, TYPE_NET}; use crate::devices::{report_net_event_fail, DeviceError}; use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN; @@ -41,10 +40,24 @@ use crate::mmds::ns::MmdsNetworkStack; use crate::rate_limiter::{BucketUpdate, RateLimiter, TokenType}; use crate::utils::net::mac::MacAddr; use crate::utils::u64_to_usize; -use crate::vstate::memory::{ByteValued, GuestMemoryMmap}; +use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap}; const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN; +#[derive(Debug, thiserror::Error, displaydoc::Display)] +enum FrontendError { + /// Add used. + AddUsed, + /// Descriptor chain too small. + DescriptorChainTooSmall, + /// Empty queue. + EmptyQueue, + /// Guest memory error: {0} + GuestMemory(GuestMemoryError), + /// Read only descriptor. + ReadOnlyDescriptor, +} + pub(crate) const fn vnet_hdr_len() -> usize { mem::size_of::<virtio_net_hdr_v1>() } @@ -89,118 +102,6 @@ pub struct ConfigSpace { // SAFETY: `ConfigSpace` contains only PODs in `repr(C)` or `repr(transparent)`, without padding.
unsafe impl ByteValued for ConfigSpace {} -#[derive(Debug, thiserror::Error, displaydoc::Display)] -enum AddRxBufferError { - /// Error while parsing new buffer: {0} - Parsing(#[from] IoVecError), - /// RX buffer is too small - BufferTooSmall, -} - -/// A map of all the memory the guest has provided us with for performing RX -#[derive(Debug)] -pub struct RxBuffers { - // minimum size of a usable buffer for doing RX - pub min_buffer_size: u32, - // An [`IoVecBufferMut`] covering all the memory we have available for receiving network - // frames. - pub iovec: IoVecBufferMut, - // A map of which part of the memory belongs to which `DescriptorChain` object - pub parsed_descriptors: VecDeque<ParsedDescriptorChain>, - // Buffers that we have used and they are ready to be given back to the guest. - pub deferred_descriptor: Option<ParsedDescriptorChain>, -} - -impl RxBuffers { - /// Create a new [`RxBuffers`] object for storing guest memory for performing RX - fn new() -> Result<Self, IoVecError> { - Ok(Self { - min_buffer_size: 0, - iovec: IoVecBufferMut::new()?, - parsed_descriptors: VecDeque::with_capacity(FIRECRACKER_MAX_QUEUE_SIZE.into()), - deferred_descriptor: None, - }) - } - - /// Add a new `DescriptorChain` that we received from the RX queue in the buffer. - /// - /// SAFETY: The `DescriptorChain` cannot be referencing the same memory location as any other - /// `DescriptorChain`. (See also related comment in - /// [`IoVecBufferMut::append_descriptor_chain`]). - unsafe fn add_buffer( - &mut self, - mem: &GuestMemoryMmap, - head: DescriptorChain, - ) -> Result<(), AddRxBufferError> { - let parsed_dc = self.iovec.append_descriptor_chain(mem, head)?; - if parsed_dc.length < self.min_buffer_size { - self.iovec.drop_descriptor_chain(&parsed_dc); - return Err(AddRxBufferError::BufferTooSmall); - } - self.parsed_descriptors.push_back(parsed_dc); - Ok(()) - } - - /// Returns the number of available `iovec` objects. - #[inline(always)] - fn len(&self) -> usize { - self.iovec.len() - } - - /// Returns `true` if there aren't any available `iovec` objects. - #[inline(always)] - fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Mark the first `size` bytes of available memory as used. - /// - /// # Safety: - /// - /// * The `RxBuffers` should include at least one parsed `DescriptorChain`. - /// * `size` needs to be smaller or equal to total length of the first `DescriptorChain` stored - /// in the `RxBuffers`. - unsafe fn mark_used(&mut self, size: u32) -> ParsedDescriptorChain { - // Since we were able to write a frame in guest memory, we should have at least one - // descriptor chain here. If not, we have a bug, so fail fast, since the device is - // fundamentally broken. - let mut parsed_dc = self.parsed_descriptors.pop_front().expect( - "net: internal bug. Mismatch between written frame size and available descriptors", - ); - - self.header_set_num_buffers(1); - self.iovec.drop_descriptor_chain(&parsed_dc); - parsed_dc.length = size; - parsed_dc - } - - /// Write the number of descriptors used in VirtIO header - fn header_set_num_buffers(&mut self, nr_descs: u16) { - // We can unwrap here, because we have checked before that the `IoVecBufferMut` holds at - // least one buffer with the proper size, depending on the feature negotiation. In any - // case, the buffer holds memory of at least `std::mem::size_of::<virtio_net_hdr_v1>()` - // bytes.
- self.iovec - .write_all_volatile_at( - &nr_descs.to_le_bytes(), - std::mem::offset_of!(virtio_net_hdr_v1, num_buffers), - ) - .unwrap() - } - - /// This will let the guest know that about all the `DescriptorChain` object that has been - /// used to receive a frame from the TAP. - fn finish_frame(&mut self, dc: &ParsedDescriptorChain, rx_queue: &mut Queue) { - // It is fine to `.unrap()` here. The only reason why `add_used` can fail is if the - // `head_index` is not a valid descriptor id. `head_index` here is a valid - // `DescriptorChain` index. We got it from `queue.pop_or_enable_notification()` which - // checks for its validity. In other words, if this unwrap() fails there's a bug in our - // emulation logic which, most likely, we can't recover from. So, let's crash here - // instead of logging an error and continuing. - rx_queue.add_used(dc.head_index, dc.length).unwrap(); - } -} - /// VirtIO network device. /// /// It emulates a network device able to exchange L2 frames between the guest @@ -221,6 +122,9 @@ pub struct Net { pub(crate) rx_rate_limiter: RateLimiter, pub(crate) tx_rate_limiter: RateLimiter, + pub(crate) rx_deferred_frame: bool, + + rx_bytes_read: usize, rx_frame_buf: [u8; MAX_BUFFER_SIZE], tx_frame_headers: [u8; frame_hdr_len()], @@ -239,7 +143,6 @@ pub struct Net { pub(crate) metrics: Arc, tx_buffer: IoVecBuffer, - pub(crate) rx_buffer: RxBuffers, } impl Net { @@ -286,6 +189,8 @@ impl Net { queue_evts, rx_rate_limiter, tx_rate_limiter, + rx_deferred_frame: false, + rx_bytes_read: 0, rx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, @@ -296,7 +201,6 @@ impl Net { mmds_ns: None, metrics: NetMetricsPerDevice::alloc(id), tx_buffer: Default::default(), - rx_buffer: RxBuffers::new()?, }) } @@ -407,50 +311,126 @@ impl Net { // Attempts to copy a single frame into the guest if there is enough // rate limiting budget. // Returns true on successful frame delivery. - fn rate_limited_rx_single_frame(&mut self, dc: &ParsedDescriptorChain) -> bool { - let rx_queue = &mut self.queues[RX_INDEX]; - if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, dc.length as u64) { + fn rate_limited_rx_single_frame(&mut self) -> bool { + if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) { self.metrics.rx_rate_limiter_throttled.inc(); return false; } - self.rx_buffer.finish_frame(dc, rx_queue); - true + // Attempt frame delivery. + let success = self.write_frame_to_guest(); + + // Undo the tokens consumption if guest delivery failed. 
+ if !success { + // revert the rate limiting budget consumption + Self::rate_limiter_replenish_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64); + } + + success } - /// Returns the minimum size of buffer we expect the guest to provide us depending on the - /// features we have negotiated with it - fn minimum_rx_buffer_size(&self) -> u32 { - if self.has_feature(VIRTIO_NET_F_GUEST_TSO4 as u64) - || self.has_feature(VIRTIO_NET_F_GUEST_TSO6 as u64) - || self.has_feature(VIRTIO_NET_F_GUEST_UFO as u64) - { - 65562 - } else { - 1526 + /// Write a slice in a descriptor chain + /// + /// # Errors + /// + /// Returns an error if the descriptor chain is too short or + /// an inappropriate (read only) descriptor is found in the chain + fn write_to_descriptor_chain( + mem: &GuestMemoryMmap, + data: &[u8], + head: DescriptorChain, + net_metrics: &NetDeviceMetrics, + ) -> Result<(), FrontendError> { + let mut chunk = data; + let mut next_descriptor = Some(head); + + while let Some(descriptor) = &next_descriptor { + if !descriptor.is_write_only() { + return Err(FrontendError::ReadOnlyDescriptor); + } + + let len = std::cmp::min(chunk.len(), descriptor.len as usize); + match mem.write_slice(&chunk[..len], descriptor.addr) { + Ok(()) => { + net_metrics.rx_count.inc(); + chunk = &chunk[len..]; + } + Err(err) => { + error!("Failed to write slice: {:?}", err); + if let GuestMemoryError::PartialBuffer { .. } = err { + net_metrics.rx_partial_writes.inc(); + } + return Err(FrontendError::GuestMemory(err)); + } + } + + // If chunk is empty we are done here. + if chunk.is_empty() { + let len = data.len() as u64; + net_metrics.rx_bytes_count.add(len); + net_metrics.rx_packets_count.inc(); + return Ok(()); + } + + next_descriptor = descriptor.next_descriptor(); } + + warn!("Receiving buffer is too small to hold frame of current size"); + Err(FrontendError::DescriptorChainTooSmall) } - /// Parse available RX `DescriptorChains` from the queue - pub fn parse_rx_descriptors(&mut self) { + // Copies a single frame from `self.rx_frame_buf` into the guest. + fn do_write_frame_to_guest(&mut self) -> Result<(), FrontendError> { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); + let queue = &mut self.queues[RX_INDEX]; - while let Some(head) = queue.pop_or_enable_notification() { - let index = head.index; - // SAFETY: we are only using this `DescriptorChain` here. - if let Err(err) = unsafe { self.rx_buffer.add_buffer(mem, head) } { - self.metrics.rx_fails.inc(); - error!("net: Could not parse an RX descriptor: {err}"); - // Try to add the bad descriptor to the used ring. - if let Err(err) = queue.add_used(index, 0) { - error!( - "net: Failed to add available RX descriptor {index} while handling a \ - parsing error: {err}", - ); + let head_descriptor = queue.pop_or_enable_notification().ok_or_else(|| { + self.metrics.no_rx_avail_buffer.inc(); + FrontendError::EmptyQueue + })?; + let head_index = head_descriptor.index; + + let result = Self::write_to_descriptor_chain( + mem, + &self.rx_frame_buf[..self.rx_bytes_read], + head_descriptor, + &self.metrics, + ); + // Mark the descriptor chain as used. If an error occurred, skip the descriptor chain. + let used_len = if result.is_err() { + self.metrics.rx_fails.inc(); + 0 + } else { + // Safe to unwrap because a frame must be smaller than 2^16 bytes. 
+ u32::try_from(self.rx_bytes_read).unwrap() + }; + queue.add_used(head_index, used_len).map_err(|err| { + error!("Failed to add available descriptor {}: {}", head_index, err); + FrontendError::AddUsed + })?; + + result + } + + // Copies a single frame from `self.rx_frame_buf` into the guest. In case of an error retries + // the operation if possible. Returns true if the operation was successful. + fn write_frame_to_guest(&mut self) -> bool { + let max_iterations = self.queues[RX_INDEX].actual_size(); + for _ in 0..max_iterations { + match self.do_write_frame_to_guest() { + Ok(()) => return true, + Err(FrontendError::EmptyQueue) | Err(FrontendError::AddUsed) => { + return false; + } + Err(_) => { + // retry + continue; + } + } + } + + false } // Tries to detour the frame to MMDS and if MMDS doesn't accept it, sends it on the host TAP. @@ -528,20 +508,7 @@ impl Net { } // We currently prioritize packets from the MMDS over regular network packets. - fn read_from_mmds_or_tap(&mut self) -> Result<Option<ParsedDescriptorChain>, NetError> { - // If we don't have any buffers available try to parse more from the RX queue. There might - // be some buffers we didn't get the chance to process, because we got to handle the TAP - // event before the RX queue event. - if self.rx_buffer.is_empty() { - self.parse_rx_descriptors(); - - // If after parsing the RX queue we still don't have any buffers stop processing RX - // frames. - if self.rx_buffer.is_empty() { - return Ok(None); - } - } - + fn read_from_mmds_or_tap(&mut self) -> Result<usize, NetError> { if let Some(ns) = self.mmds_ns.as_mut() { if let Some(len) = ns.write_next_frame(frame_bytes_from_buf_mut(&mut self.rx_frame_buf)?) @@ -550,48 +517,22 @@ impl Net { METRICS.mmds.tx_frames.inc(); METRICS.mmds.tx_bytes.add(len as u64); init_vnet_hdr(&mut self.rx_frame_buf); - self.rx_buffer - .iovec - .write_all_volatile_at(&self.rx_frame_buf[..vnet_hdr_len() + len], 0)?; - // SAFETY: This is safe: - // * We checked that `rx_buffer` includes at least one `DescriptorChain` - // * `rx_frame_buf` has size of `MAX_BUFFER_SIZE` and all `DescriptorChain` objects - // are at least that big. - let dc = unsafe { - self.rx_buffer - .mark_used((vnet_hdr_len() + len).try_into().unwrap()) - }; - - return Ok(Some(dc)); + return Ok(vnet_hdr_len() + len); } } - // SAFETY: this is safe because we ensured that `self.rx_buffer` has at least one - // DescriptorChain parsed in it. - let len = unsafe { self.read_tap().map_err(NetError::IO) }?; - - // SAFETY: This is safe, - // * `rx_buffer` has at least one `DescriptorChain` - // * `read_tap` passes the first `DescriptorChain` to `readv` so we can't have read more - // bytes than its capacity. - let dc = unsafe { self.rx_buffer.mark_used(len.try_into().unwrap()) }; - Ok(Some(dc)) + self.read_tap().map_err(NetError::IO) } - /// Read as many frames as possible. fn process_rx(&mut self) -> Result<(), DeviceError> { + // Read as many frames as possible.
loop { match self.read_from_mmds_or_tap() { - Ok(None) => { - self.metrics.no_rx_avail_buffer.inc(); - break; - } - Ok(Some(dc)) => { + Ok(count) => { + self.rx_bytes_read = count; self.metrics.rx_count.inc(); - self.metrics.rx_bytes_count.add(dc.length as u64); - self.metrics.rx_packets_count.inc(); - if !self.rate_limited_rx_single_frame(&dc) { - self.rx_buffer.deferred_descriptor = Some(dc); + if !self.rate_limited_rx_single_frame() { + self.rx_deferred_frame = true; break; } } @@ -617,18 +558,24 @@ impl Net { self.try_signal_queue(NetQueue::Rx) } - fn resume_rx(&mut self) -> Result<(), DeviceError> { - // First try to handle any deferred frame - if let Some(deferred_descriptor) = self.rx_buffer.deferred_descriptor.take() { - // If can't finish sending this frame, re-set it as deferred and return; we can't - // process any more frames from the TAP. - if !self.rate_limited_rx_single_frame(&deferred_descriptor) { - self.rx_buffer.deferred_descriptor = Some(deferred_descriptor); - return Ok(()); - } + // Process the deferred frame first, then continue reading from tap. + fn handle_deferred_frame(&mut self) -> Result<(), DeviceError> { + if self.rate_limited_rx_single_frame() { + self.rx_deferred_frame = false; + // process_rx() was interrupted possibly before consuming all + // packets in the tap; try continuing now. + return self.process_rx(); } - self.process_rx() + self.try_signal_queue(NetQueue::Rx) + } + + fn resume_rx(&mut self) -> Result<(), DeviceError> { + if self.rx_deferred_frame { + self.handle_deferred_frame() + } else { + Ok(()) + } } fn process_tx(&mut self) -> Result<(), DeviceError> { @@ -689,7 +636,7 @@ impl Net { &self.metrics, ) .unwrap_or(false); - if frame_consumed_by_mmds && self.rx_buffer.deferred_descriptor.is_none() { + if frame_consumed_by_mmds && !self.rx_deferred_frame { // MMDS consumed this frame/request, let's also try to process the response. process_rx_for_mmds = true; } @@ -768,15 +715,8 @@ impl Net { self.tx_rate_limiter.update_buckets(tx_bytes, tx_ops); } - /// Reads a frame from the TAP device inside the first descriptor held by `self.rx_buffer`. - /// - /// # Safety - /// - /// `self.rx_buffer` needs to have at least one descriptor chain parsed - pub unsafe fn read_tap(&mut self) -> std::io::Result { - let nr_iovecs = self.rx_buffer.parsed_descriptors[0].nr_iovecs as usize; - self.tap - .read_iovec(&mut self.rx_buffer.iovec.as_iovec_mut_slice()[..nr_iovecs]) + fn read_tap(&mut self) -> std::io::Result { + self.tap.read(&mut self.rx_frame_buf) } fn write_tap(tap: &mut Tap, buf: &IoVecBuffer) -> std::io::Result { @@ -794,12 +734,7 @@ impl Net { // rate limiters present but with _very high_ allowed rate error!("Failed to get rx queue event: {:?}", err); self.metrics.event_fails.inc(); - return; - } else { - self.parse_rx_descriptors(); - } - - if self.rx_rate_limiter.is_blocked() { + } else if self.rx_rate_limiter.is_blocked() { self.metrics.rx_rate_limiter_throttled.inc(); } else { // If the limiter is not blocked, resume the receiving of bytes. @@ -812,14 +747,31 @@ impl Net { // This is safe since we checked in the event handler that the device is activated. self.metrics.rx_tap_event_count.inc(); + // While there are no available RX queue buffers and there's a deferred_frame + // don't process any more incoming. Otherwise start processing a frame. In the + // process the deferred_frame flag will be set in order to avoid freezing the + // RX queue. 
+ if self.queues[RX_INDEX].is_empty() && self.rx_deferred_frame { + self.metrics.no_rx_avail_buffer.inc(); + return; + } + // While limiter is blocked, don't process any more incoming. if self.rx_rate_limiter.is_blocked() { self.metrics.rx_rate_limiter_throttled.inc(); return; } - self.resume_rx() - .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + if self.rx_deferred_frame + // Process a deferred frame first if available. Don't read from tap again + // until we manage to receive this deferred frame. + { + self.handle_deferred_frame() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + } else { + self.process_rx() + .unwrap_or_else(|err| report_net_event_fail(&self.metrics, err)); + } } /// Process a single TX queue event. @@ -961,8 +913,6 @@ impl VirtioDevice for Net { .set_offload(supported_flags) .map_err(super::super::ActivateError::TapSetOffload)?; - self.rx_buffer.min_buffer_size = self.minimum_rx_buffer_size(); - if self.activate_evt.write(1).is_err() { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); @@ -1008,14 +958,6 @@ pub mod tests { use crate::utils::net::mac::{MacAddr, MAC_ADDR_LEN}; use crate::vstate::memory::{Address, GuestMemory}; - /// Write the number of descriptors used in VirtIO header - fn header_set_num_buffers(frame: &mut [u8], nr_descs: u16) { - let bytes = nr_descs.to_le_bytes(); - let offset = std::mem::offset_of!(virtio_net_hdr_v1, num_buffers); - frame[offset] = bytes[0]; - frame[offset + 1] = bytes[1]; - } - #[test] fn test_vnet_helpers() { let mut frame_buf = vec![42u8; vnet_hdr_len() - 1]; @@ -1202,14 +1144,9 @@ pub mod tests { (2, 1000, VIRTQ_DESC_F_WRITE), ], ); - let mut frame = inject_tap_tx_frame(&th.net(), 1000); - check_metric_after_block!( - th.net().metrics.rx_fails, - 1, - th.event_manager.run_with_timeout(100).unwrap() - ); + let frame = th.check_rx_deferred_frame(1000); th.rxq.check_used_elem(0, 0, 0); - header_set_num_buffers(frame.as_mut_slice(), 1); + th.check_rx_queue_resume(&frame); } @@ -1220,10 +1157,9 @@ pub mod tests { th.activate_net(); th.add_desc_chain(NetQueue::Rx, 0, &[(0, 100, VIRTQ_DESC_F_WRITE)]); - let mut frame = th.check_rx_discarded_buffer(1000); + let frame = th.check_rx_deferred_frame(1000); th.rxq.check_used_elem(0, 0, 0); - header_set_num_buffers(frame.as_mut_slice(), 1); th.check_rx_queue_resume(&frame); } @@ -1245,10 +1181,9 @@ pub mod tests { (2, 4096, VIRTQ_DESC_F_WRITE), ], ); - let mut frame = th.check_rx_discarded_buffer(1000); + let frame = th.check_rx_deferred_frame(1000); th.rxq.check_used_elem(0, 0, 0); - header_set_num_buffers(frame.as_mut_slice(), 1); th.check_rx_queue_resume(&frame); } @@ -1277,12 +1212,11 @@ pub mod tests { &[(4, 1000, VIRTQ_DESC_F_WRITE)], ); - // Add valid descriptor chain. TestHelper does not negotiate any feature offloading so the - // buffers need to be at least 1526 bytes long. - th.add_desc_chain(NetQueue::Rx, 1300, &[(5, 1526, VIRTQ_DESC_F_WRITE)]); + // Add valid descriptor chain. + th.add_desc_chain(NetQueue::Rx, 1300, &[(5, 1000, VIRTQ_DESC_F_WRITE)]); // Inject frame to tap and run epoll. - let mut frame = inject_tap_tx_frame(&th.net(), 1000); + let frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( th.net().metrics.rx_packets_count, 1, @@ -1297,11 +1231,10 @@ pub mod tests { th.rxq.check_used_elem(1, 3, 0); th.rxq.check_used_elem(2, 4, 0); // Check that the frame wasn't deferred. 
- assert!(th.net().rx_buffer.deferred_descriptor.is_none()); + assert!(!th.net().rx_deferred_frame); // Check that the frame has been written successfully to the valid Rx descriptor chain. th.rxq .check_used_elem(3, 5, frame.len().try_into().unwrap()); - header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq.dtable[5].check_data(&frame); } @@ -1324,7 +1257,7 @@ pub mod tests { ], ); // Inject frame to tap and run epoll. - let mut frame = inject_tap_tx_frame(&th.net(), 1000); + let frame = inject_tap_tx_frame(&th.net(), 1000); check_metric_after_block!( th.net().metrics.rx_packets_count, 1, @@ -1332,12 +1265,11 @@ pub mod tests { ); // Check that the frame wasn't deferred. - assert!(th.net().rx_buffer.deferred_descriptor.is_none()); + assert!(!th.net().rx_deferred_frame); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // Check that the frame has been written successfully to the Rx descriptor chain. - header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq .check_used_elem(0, 3, frame.len().try_into().unwrap()); th.rxq.dtable[3].check_data(&frame[..100]); @@ -1356,24 +1288,16 @@ pub mod tests { th.add_desc_chain( NetQueue::Rx, 0, - &[ - (0, 500, VIRTQ_DESC_F_WRITE), - (1, 500, VIRTQ_DESC_F_WRITE), - (2, 526, VIRTQ_DESC_F_WRITE), - ], + &[(0, 500, VIRTQ_DESC_F_WRITE), (1, 500, VIRTQ_DESC_F_WRITE)], ); th.add_desc_chain( NetQueue::Rx, - 2000, - &[ - (3, 500, VIRTQ_DESC_F_WRITE), - (4, 500, VIRTQ_DESC_F_WRITE), - (5, 526, VIRTQ_DESC_F_WRITE), - ], + 1000, + &[(2, 500, VIRTQ_DESC_F_WRITE), (3, 500, VIRTQ_DESC_F_WRITE)], ); // Inject 2 frames to tap and run epoll. - let mut frame_1 = inject_tap_tx_frame(&th.net(), 200); - let mut frame_2 = inject_tap_tx_frame(&th.net(), 300); + let frame_1 = inject_tap_tx_frame(&th.net(), 200); + let frame_2 = inject_tap_tx_frame(&th.net(), 300); check_metric_after_block!( th.net().metrics.rx_packets_count, 2, @@ -1381,24 +1305,20 @@ pub mod tests { ); // Check that the frames weren't deferred. - assert!(th.net().rx_buffer.deferred_descriptor.is_none()); + assert!(!th.net().rx_deferred_frame); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. - header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq .check_used_elem(0, 0, frame_1.len().try_into().unwrap()); th.rxq.dtable[0].check_data(&frame_1); th.rxq.dtable[1].check_data(&[0; 500]); - th.rxq.dtable[2].check_data(&[0; 526]); // Check that the 2nd frame was written successfully to the 2nd Rx descriptor chain. - header_set_num_buffers(frame_2.as_mut_slice(), 1); th.rxq - .check_used_elem(1, 3, frame_2.len().try_into().unwrap()); - th.rxq.dtable[3].check_data(&frame_2); - th.rxq.dtable[4].check_data(&[0; 500]); - th.rxq.dtable[2].check_data(&[0; 526]); + .check_used_elem(1, 2, frame_2.len().try_into().unwrap()); + th.rxq.dtable[2].check_data(&frame_2); + th.rxq.dtable[3].check_data(&[0; 500]); } #[test] @@ -1685,19 +1605,6 @@ pub mod tests { fn test_mmds_detour_and_injection() { let mut net = default_net(); - // Inject a fake buffer in the devices buffers, otherwise we won't be able to receive the - // MMDS frame. One iovec will be just fine. 
-        let mut fake_buffer = vec![0u8; 1024];
-        let iov_buffer = IoVecBufferMut::from(fake_buffer.as_mut_slice());
-        net.rx_buffer.iovec = iov_buffer;
-        net.rx_buffer
-            .parsed_descriptors
-            .push_back(ParsedDescriptorChain {
-                head_index: 1,
-                length: 1024,
-                nr_iovecs: 1,
-            });
-
         let src_mac = MacAddr::from_str("11:11:11:11:11:11").unwrap();
         let src_ip = Ipv4Addr::new(10, 1, 2, 3);
         let dst_mac = MacAddr::from_str("22:22:22:22:22:22").unwrap();
@@ -1814,12 +1721,8 @@ pub mod tests {
         // SAFETY: its a valid fd
         unsafe { libc::close(th.net.lock().unwrap().tap.as_raw_fd()) };

-        // The RX queue is empty and there is a deferred frame.
-        th.net().rx_buffer.deferred_descriptor = Some(ParsedDescriptorChain {
-            head_index: 1,
-            length: 100,
-            nr_iovecs: 1,
-        });
+        // The RX queue is empty and rx_deferred_frame is set.
+        th.net().rx_deferred_frame = true;
         check_metric_after_block!(
             th.net().metrics.no_rx_avail_buffer,
             1,
@@ -1829,9 +1732,10 @@ pub mod tests {
         // We need to set this here to false, otherwise the device will try to
         // handle a deferred frame, it will fail and will never try to read from
         // the tap.
-        th.net().rx_buffer.deferred_descriptor = None;
+        th.net().rx_deferred_frame = false;

-        th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
+        // Fake an avail buffer; this time, tap reading should error out.
+        th.rxq.avail.idx.set(1);
         check_metric_after_block!(
             th.net().metrics.tap_read_fails,
             1,
@@ -1839,6 +1743,59 @@ pub mod tests {
         );
     }

+    #[test]
+    fn test_deferred_frame() {
+        let mem = single_region_mem(2 * MAX_BUFFER_SIZE);
+        let mut th = TestHelper::get_default(&mem);
+        th.activate_net();
+
+        let rx_packets_count = th.net().metrics.rx_packets_count.count();
+        let _ = inject_tap_tx_frame(&th.net(), 1000);
+        // Trigger a Tap event. This should fail since there are no
+        // available descriptors in the queue.
+        check_metric_after_block!(
+            th.net().metrics.no_rx_avail_buffer,
+            1,
+            th.simulate_event(NetEvent::Tap)
+        );
+        // The frame we read from the tap should be deferred now and
+        // no frames should have been delivered to the guest.
+        assert!(th.net().rx_deferred_frame);
+        assert_eq!(th.net().metrics.rx_packets_count.count(), rx_packets_count);
+
+        // Let's add a second frame, which should really have the same fate.
+        let _ = inject_tap_tx_frame(&th.net(), 1000);
+
+        // Adding a descriptor in the queue. This should handle the first deferred
+        // frame. However, this should try to handle the second frame as well and fail
+        // since there's only one descriptor chain in the queue.
+        th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]);
+        check_metric_after_block!(
+            th.net().metrics.no_rx_avail_buffer,
+            1,
+            th.simulate_event(NetEvent::Tap)
+        );
+        // We should still have a deferred frame.
+        assert!(th.net().rx_deferred_frame);
+        // However, we should have delivered the first frame.
+        assert_eq!(
+            th.net().metrics.rx_packets_count.count(),
+            rx_packets_count + 1
+        );
+
+        // Let's add one more descriptor and try to handle the last frame as well.
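+        // Simulating the RxQueue event below models the guest notifying the device that a
+        // new RX buffer is available; handling that notification delivers the remaining
+        // deferred frame and clears `rx_deferred_frame`.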
+ th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); + check_metric_after_block!( + th.net().metrics.rx_packets_count, + 1, + th.simulate_event(NetEvent::RxQueue) + ); + + // We should be done with any deferred frame + assert!(!th.net().rx_deferred_frame); + } + #[test] fn test_rx_rate_limiter_handling() { let mem = single_region_mem(2 * MAX_BUFFER_SIZE); @@ -1951,10 +1908,10 @@ pub mod tests { let mut rl = RateLimiter::new(1000, 0, 500, 0, 0, 0).unwrap(); // set up RX - assert!(th.net().rx_buffer.deferred_descriptor.is_none()); + assert!(!th.net().rx_deferred_frame); th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - let mut frame = inject_tap_tx_frame(&th.net(), 1000); + let frame = inject_tap_tx_frame(&th.net(), 1000); // use up the budget (do it after injecting the tx frame, as socket communication is // slow enough that the ratelimiter could replenish in the meantime). @@ -1971,7 +1928,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); - assert!(th.net().rx_buffer.deferred_descriptor.is_some()); + assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing @@ -2006,7 +1963,6 @@ pub mod tests { assert_eq!(th.rxq.used.idx.get(), 1); th.rxq .check_used_elem(0, 0, frame.len().try_into().unwrap()); - header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq.dtable[0].check_data(&frame); } } @@ -2070,9 +2026,9 @@ pub mod tests { let mut rl = RateLimiter::new(0, 0, 0, 1, 0, 500).unwrap(); // set up RX - assert!(th.net().rx_buffer.deferred_descriptor.is_none()); + assert!(!th.net().rx_deferred_frame); th.add_desc_chain(NetQueue::Rx, 0, &[(0, 4096, VIRTQ_DESC_F_WRITE)]); - let mut frame = inject_tap_tx_frame(&th.net(), 1234); + let frame = inject_tap_tx_frame(&th.net(), 1234); // use up the initial budget assert!(rl.consume(1, TokenType::Ops)); @@ -2092,7 +2048,7 @@ pub mod tests { // assert that limiter is blocked assert!(th.net().rx_rate_limiter.is_blocked()); assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); - assert!(th.net().rx_buffer.deferred_descriptor.is_some()); + assert!(th.net().rx_deferred_frame); // assert that no operation actually completed (limiter blocked it) assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing @@ -2119,7 +2075,6 @@ pub mod tests { assert_eq!(th.rxq.used.idx.get(), 1); th.rxq .check_used_elem(0, 0, frame.len().try_into().unwrap()); - header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq.dtable[0].check_data(&frame); } } diff --git a/src/vmm/src/devices/virtio/net/mod.rs b/src/vmm/src/devices/virtio/net/mod.rs index e8a3f86ac72..1a7972595ad 100644 --- a/src/vmm/src/devices/virtio/net/mod.rs +++ b/src/vmm/src/devices/virtio/net/mod.rs @@ -27,10 +27,8 @@ pub mod test_utils; mod gen; pub use tap::{Tap, TapError}; -use vm_memory::VolatileMemoryError; pub use self::device::Net; -use super::iovec::IoVecError; /// Enum representing the Net device queue types #[derive(Debug)] @@ -52,10 +50,6 @@ pub enum NetError { EventFd(io::Error), /// IO error: {0} IO(io::Error), - /// Error writing in guest memory: {0} - GuestMemoryError(#[from] VolatileMemoryError), /// The VNET header is missing from the frame VnetHeaderMissing, - /// IoVecBuffer(Mut) error: {0} - IoVecError(#[from] 
IoVecError), } diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index c7918f07ab3..4f0ae35d966 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -9,10 +9,9 @@ use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; -use super::device::{Net, RxBuffers}; -use super::{TapError, NET_NUM_QUEUES, RX_INDEX}; +use super::device::Net; +use super::{TapError, NET_NUM_QUEUES}; use crate::devices::virtio::device::DeviceState; -use crate::devices::virtio::iovec::ParsedDescriptorChain; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::TYPE_NET; @@ -32,23 +31,6 @@ pub struct NetConfigSpaceState { guest_mac: Option, } -/// Information about the parsed RX buffers -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct RxBufferState { - // Number of iovecs we have parsed from the guest - parsed_descriptor_chains_nr: u16, - deferred_descriptor: Option, -} - -impl RxBufferState { - fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self { - RxBufferState { - parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(), - deferred_descriptor: rx_buffer.deferred_descriptor.clone(), - } - } -} - /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -61,7 +43,6 @@ pub struct NetState { pub mmds_ns: Option, config_space: NetConfigSpaceState, virtio_state: VirtioDeviceState, - rx_buffers_state: RxBufferState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -104,7 +85,6 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), - rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer), } } @@ -157,14 +137,6 @@ impl Persist<'_> for Net { .map_err(NetPersistError::TapSetOffload)?; net.device_state = DeviceState::Activated(constructor_args.mem); - - // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily - // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. - net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; - net.parse_rx_descriptors(); - net.rx_buffer - .deferred_descriptor - .clone_from(&state.rx_buffers_state.deferred_descriptor); } Ok(net) diff --git a/src/vmm/src/devices/virtio/net/tap.rs b/src/vmm/src/devices/virtio/net/tap.rs index 4d1757edc8e..20024a1ae8e 100644 --- a/src/vmm/src/devices/virtio/net/tap.rs +++ b/src/vmm/src/devices/virtio/net/tap.rs @@ -7,7 +7,7 @@ use std::fmt::{self, Debug}; use std::fs::File; -use std::io::Error as IoError; +use std::io::{Error as IoError, Read}; use std::os::raw::*; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; @@ -190,19 +190,11 @@ impl Tap { } Ok(usize::try_from(ret).unwrap()) } +} - /// Read from tap to an `IoVecBufferMut` - pub(crate) fn read_iovec(&mut self, buffer: &mut [libc::iovec]) -> Result { - let iov = buffer.as_mut_ptr(); - let iovcnt = buffer.len().try_into().unwrap(); - - // SAFETY: `readv` is safe. Called with a valid tap fd, the iovec pointer and length - // is provide by the `IoVecBufferMut` implementation and we check the return value. 
-        let ret = unsafe { libc::readv(self.tap_file.as_raw_fd(), iov, iovcnt) };
-        if ret == -1 {
-            return Err(IoError::last_os_error());
-        }
-        Ok(usize::try_from(ret).unwrap())
+impl Read for Tap {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, IoError> {
+        self.tap_file.read(buf)
     }
 }

@@ -219,7 +211,6 @@ pub mod tests {
     use std::os::unix::ffi::OsStrExt;

     use super::*;
-    use crate::devices::virtio::iovec::IoVecBufferMut;
     use crate::devices::virtio::net::gen;
     use crate::devices::virtio::net::test_utils::{enable, if_index, TapTrafficSimulator};

@@ -227,6 +218,7 @@ pub mod tests {

     const VNET_HDR_SIZE: usize = 10;
     const PAYLOAD_SIZE: usize = 512;
+    const PACKET_SIZE: usize = 1024;

     #[test]
     fn test_tap_name() {
@@ -295,6 +287,23 @@ pub mod tests {
         assert_eq!(tap.as_raw_fd(), tap.tap_file.as_raw_fd());
     }

+    #[test]
+    fn test_read() {
+        let mut tap = Tap::open_named("").unwrap();
+        enable(&tap);
+        let tap_traffic_simulator = TapTrafficSimulator::new(if_index(&tap));
+
+        let packet = vmm_sys_util::rand::rand_alphanumerics(PAYLOAD_SIZE);
+        tap_traffic_simulator.push_tx_packet(packet.as_bytes());
+
+        let mut buf = [0u8; PACKET_SIZE];
+        assert_eq!(tap.read(&mut buf).unwrap(), PAYLOAD_SIZE + VNET_HDR_SIZE);
+        assert_eq!(
+            &buf[VNET_HDR_SIZE..packet.len() + VNET_HDR_SIZE],
+            packet.as_bytes()
+        );
+    }
+
     #[test]
     fn test_write_iovec() {
         let mut tap = Tap::open_named("").unwrap();
@@ -330,26 +339,4 @@ pub mod tests {
             fragment3
         );
     }
-
-    #[test]
-    fn test_read_iovec() {
-        let mut tap = Tap::open_named("").unwrap();
-        enable(&tap);
-        let tap_traffic_simulator = TapTrafficSimulator::new(if_index(&tap));
-
-        let mut buff1 = vec![0; PAYLOAD_SIZE + VNET_HDR_SIZE];
-        let mut buff2 = vec![0; 2 * PAYLOAD_SIZE];
-
-        let mut rx_buffers = IoVecBufferMut::from(vec![buff1.as_mut_slice(), buff2.as_mut_slice()]);
-
-        let packet = vmm_sys_util::rand::rand_alphanumerics(2 * PAYLOAD_SIZE);
-        tap_traffic_simulator.push_tx_packet(packet.as_bytes());
-        assert_eq!(
-            tap.read_iovec(rx_buffers.as_iovec_mut_slice()).unwrap(),
-            2 * PAYLOAD_SIZE + VNET_HDR_SIZE
-        );
-        assert_eq!(&buff1[VNET_HDR_SIZE..], &packet.as_bytes()[..PAYLOAD_SIZE]);
-        assert_eq!(&buff2[..PAYLOAD_SIZE], &packet.as_bytes()[PAYLOAD_SIZE..]);
-        assert_eq!(&buff2[PAYLOAD_SIZE..], &vec![0; PAYLOAD_SIZE])
-    }
 }
diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs
index eb1c6f6e883..07808bbb44b 100644
--- a/src/vmm/src/devices/virtio/net/test_utils.rs
+++ b/src/vmm/src/devices/virtio/net/test_utils.rs
@@ -430,9 +430,8 @@ pub mod test {
            event_fd.write(1).unwrap();
        }

-        /// Generate a tap frame of `frame_len` and check that it is not read and
-        /// the descriptor chain has been discarded
-        pub fn check_rx_discarded_buffer(&mut self, frame_len: usize) -> Vec<u8> {
+        /// Generate a tap frame of `frame_len` and check that it is deferred
+        pub fn check_rx_deferred_frame(&mut self, frame_len: usize) -> Vec<u8> {
            let used_idx = self.rxq.used.idx.get();

            // Inject frame to tap and run epoll.
@@ -442,6 +441,8 @@ pub mod test {
                0,
                self.event_manager.run_with_timeout(100).unwrap()
            );
+            // Check that the frame has been deferred.
+            assert!(self.net().rx_deferred_frame);
            // Check that the descriptor chain has been discarded.
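            // The discarded chain is returned to the guest as a used element of length 0,
            // which is why `used.idx` advances even though the frame itself is still pending.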
assert_eq!(self.rxq.used.idx.get(), used_idx + 1); assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); @@ -453,9 +454,16 @@ pub mod test { /// is eventually received by the guest pub fn check_rx_queue_resume(&mut self, expected_frame: &[u8]) { let used_idx = self.rxq.used.idx.get(); - // Add a valid Rx avail descriptor chain and run epoll. We do not negotiate any feature - // offloading so the buffers need to be at least 1526 bytes long. - self.add_desc_chain(NetQueue::Rx, 0, &[(0, 1526, VIRTQ_DESC_F_WRITE)]); + // Add a valid Rx avail descriptor chain and run epoll. + self.add_desc_chain( + NetQueue::Rx, + 0, + &[( + 0, + u32::try_from(expected_frame.len()).unwrap(), + VIRTQ_DESC_F_WRITE, + )], + ); check_metric_after_block!( self.net().metrics.rx_packets_count, 1, diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 6a73816947a..4bdea1c40f6 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -112,7 +112,7 @@ impl Entropy { return Ok(0); } - let mut rand_bytes = vec![0; iovec.len()]; + let mut rand_bytes = vec![0; iovec.len() as usize]; rand::fill(&mut rand_bytes).map_err(|err| { METRICS.host_rng_fails.inc(); err @@ -120,9 +120,7 @@ impl Entropy { // It is ok to unwrap here. We are writing `iovec.len()` bytes at offset 0. iovec.write_all_volatile_at(&rand_bytes, 0).unwrap(); - // It is ok to unwrap here. `iovec` contains only a single `DescriptorChain`, which means - // that its length fit in a u32. - Ok(u32::try_from(iovec.len()).unwrap()) + Ok(iovec.len()) } fn process_entropy_queue(&mut self) { @@ -147,7 +145,7 @@ impl Entropy { // Check for available rate limiting budget. // If not enough budget is available, leave the request descriptor in the queue // to handle once we do have budget. 
- if !Self::rate_limit_request(&mut self.rate_limiter, iovec.len() as u64) { + if !Self::rate_limit_request(&mut self.rate_limiter, u64::from(iovec.len())) { debug!("entropy: throttling entropy queue"); METRICS.entropy_rate_limiter_throttled.inc(); self.queues[RNG_QUEUE].undo_pop(); diff --git a/src/vmm/src/devices/virtio/vsock/mod.rs b/src/vmm/src/devices/virtio/vsock/mod.rs index 879097fef45..7fdc86aed2e 100644 --- a/src/vmm/src/devices/virtio/vsock/mod.rs +++ b/src/vmm/src/devices/virtio/vsock/mod.rs @@ -30,7 +30,6 @@ pub use self::defs::uapi::VIRTIO_ID_VSOCK as TYPE_VSOCK; pub use self::defs::VSOCK_DEV_ID; pub use self::device::Vsock; pub use self::unix::{VsockUnixBackend, VsockUnixBackendError}; -use super::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecError; use crate::devices::virtio::persist::PersistError as VirtioStateError; @@ -139,8 +138,6 @@ pub enum VsockError { VirtioState(VirtioStateError), /// Vsock uds backend error: {0} VsockUdsBackend(VsockUnixBackendError), - /// Underlying IovDeque error: {0} - IovDeque(IovDequeError), } impl From for VsockError { @@ -150,7 +147,6 @@ impl From for VsockError { IoVecError::ReadOnlyDescriptor => VsockError::UnwritableDescriptor, IoVecError::GuestMemory(err) => VsockError::GuestMemoryMmap(err), IoVecError::OverflowedDescriptor => VsockError::DescChainOverflow, - IoVecError::IovDeque(err) => VsockError::IovDeque(err), } } } diff --git a/src/vmm/src/devices/virtio/vsock/packet.rs b/src/vmm/src/devices/virtio/vsock/packet.rs index f43c586330c..d63dd41386e 100644 --- a/src/vmm/src/devices/virtio/vsock/packet.rs +++ b/src/vmm/src/devices/virtio/vsock/packet.rs @@ -172,10 +172,8 @@ impl VsockPacket { // are live at the same time, meaning this has exclusive ownership over the memory let buffer = unsafe { IoVecBufferMut::from_descriptor_chain(mem, chain)? }; - // It is ok to unwrap the conversion from usize to u32, because the `buffer` only contains - // a single `DescriptorChain`, so its length fits in a u32. - if (u32::try_from(buffer.len()).unwrap()) < VSOCK_PKT_HDR_SIZE { - return Err(VsockError::DescChainTooShortForHeader(buffer.len())); + if buffer.len() < VSOCK_PKT_HDR_SIZE { + return Err(VsockError::DescChainTooShortForHeader(buffer.len() as usize)); } Ok(Self { @@ -224,7 +222,7 @@ impl VsockPacket { pub fn buf_size(&self) -> u32 { let chain_length = match self.buffer { VsockPacketBuffer::Tx(ref iovec_buf) => iovec_buf.len(), - VsockPacketBuffer::Rx(ref iovec_buf) => u32::try_from(iovec_buf.len()).unwrap(), + VsockPacketBuffer::Rx(ref iovec_buf) => iovec_buf.len(), }; chain_length - VSOCK_PKT_HDR_SIZE } @@ -239,8 +237,8 @@ impl VsockPacket { VsockPacketBuffer::Tx(_) => Err(VsockError::UnwritableDescriptor), VsockPacketBuffer::Rx(ref mut buffer) => { if count - > u32::try_from(buffer.len()) - .unwrap() + > buffer + .len() .saturating_sub(VSOCK_PKT_HDR_SIZE) .saturating_sub(offset) {