Skip to content

Commit 0acf388

Browse files
committed
net: use writev when writing frames to tap
This allows us to avoid an extra user-space copy of the packet we are transmitting to the tap. We use directly the buffers described by the `DescriptorChain` by means of `IoVecBuffer`s. Signed-off-by: Babis Chalios <[email protected]>
1 parent 2586711 commit 0acf388

File tree

5 files changed

+129
-124
lines changed

5 files changed

+129
-124
lines changed

resources/seccomp/aarch64-unknown-linux-musl.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
{
2525
"syscall": "write"
2626
},
27+
{
28+
"syscall": "writev",
29+
"comment": "Used by the VirtIO net device to write to tap"
30+
},
2731
{
2832
"syscall": "fsync"
2933
},

resources/seccomp/x86_64-unknown-linux-musl.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
{
2525
"syscall": "write"
2626
},
27+
{
28+
"syscall": "writev",
29+
"comment": "Used by the VirtIO net device to write to tap"
30+
},
2731
{
2832
"syscall": "fsync"
2933
},

src/devices/src/virtio/net/device.rs

Lines changed: 85 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
// Use of this source code is governed by a BSD-style license that can be
66
// found in the THIRD-PARTY file.
77

8+
use crate::virtio::net::iovec::IoVecBuffer;
89
use crate::virtio::net::tap::Tap;
910
#[cfg(test)]
1011
use crate::virtio::net::test_utils::Mocks;
1112
use crate::virtio::net::Error;
1213
use crate::virtio::net::NetQueue;
1314
use crate::virtio::net::Result;
14-
use crate::virtio::net::{MAX_BUFFER_SIZE, QUEUE_SIZE, QUEUE_SIZES, RX_INDEX, TX_INDEX};
15+
use crate::virtio::net::{MAX_BUFFER_SIZE, QUEUE_SIZES, RX_INDEX, TX_INDEX};
1516
use crate::virtio::DescriptorChain;
1617
use crate::virtio::{
1718
ActivateResult, DeviceState, IrqTrigger, IrqType, Queue, VirtioDevice, TYPE_NET,
@@ -39,7 +40,12 @@ use virtio_gen::virtio_net::{
3940
VIRTIO_NET_F_MAC,
4041
};
4142
use virtio_gen::virtio_ring::VIRTIO_RING_F_EVENT_IDX;
42-
use vm_memory::{ByteValued, Bytes, GuestAddress, GuestMemoryError, GuestMemoryMmap};
43+
use vm_memory::{ByteValued, Bytes, GuestMemoryError, GuestMemoryMmap};
44+
45+
use dumbo::pdu::arp::ETH_IPV4_FRAME_LEN;
46+
use dumbo::pdu::ethernet::PAYLOAD_OFFSET;
47+
48+
const FRAME_HEADER_MAX_LEN: usize = PAYLOAD_OFFSET + ETH_IPV4_FRAME_LEN;
4349

4450
enum FrontendError {
4551
AddUsed,
@@ -53,6 +59,13 @@ pub(crate) fn vnet_hdr_len() -> usize {
5359
mem::size_of::<virtio_net_hdr_v1>()
5460
}
5561

62+
// This returns the maximum frame header length. This includes the VNET header plus
63+
// the maximum L2 frame header bytes which includes the ethernet frame header plus
64+
// the header IPv4 ARP header which is 28 bytes long.
65+
fn frame_hdr_len() -> usize {
66+
vnet_hdr_len() + FRAME_HEADER_MAX_LEN
67+
}
68+
5669
// Frames being sent/received through the network device model have a VNET header. This
5770
// function returns a slice which holds the L2 frame bytes without this header.
5871
fn frame_bytes_from_buf(buf: &[u8]) -> Result<&[u8]> {
@@ -114,7 +127,6 @@ pub struct Net {
114127
rx_bytes_read: usize,
115128
rx_frame_buf: [u8; MAX_BUFFER_SIZE],
116129

117-
tx_iovec: Vec<(GuestAddress, usize)>,
118130
tx_frame_buf: [u8; MAX_BUFFER_SIZE],
119131

120132
pub(crate) irq_trigger: IrqTrigger,
@@ -189,7 +201,6 @@ impl Net {
189201
rx_bytes_read: 0,
190202
rx_frame_buf: [0u8; MAX_BUFFER_SIZE],
191203
tx_frame_buf: [0u8; MAX_BUFFER_SIZE],
192-
tx_iovec: Vec::with_capacity(QUEUE_SIZE as usize),
193204
irq_trigger: IrqTrigger::new().map_err(Error::EventFd)?,
194205
device_state: DeviceState::Inactive,
195206
activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(Error::EventFd)?,
@@ -266,24 +277,31 @@ impl Net {
266277
Ok(())
267278
}
268279

280+
// Helper function to consume one op with `size` bytes from a rate limiter
281+
fn rate_limiter_consume_op(rate_limiter: &mut RateLimiter, size: u64) -> bool {
282+
if !rate_limiter.consume(1, TokenType::Ops) {
283+
return false;
284+
}
285+
286+
if !rate_limiter.consume(size, TokenType::Bytes) {
287+
rate_limiter.manual_replenish(1, TokenType::Ops);
288+
return false;
289+
}
290+
291+
true
292+
}
293+
294+
// Helper function to replenish one operation with `size` bytes from a rate limiter
295+
fn rate_limiter_replenish_op(rate_limiter: &mut RateLimiter, size: u64) {
296+
rate_limiter.manual_replenish(1, TokenType::Ops);
297+
rate_limiter.manual_replenish(size, TokenType::Bytes);
298+
}
299+
269300
// Attempts to copy a single frame into the guest if there is enough
270301
// rate limiting budget.
271302
// Returns true on successful frame delivery.
272303
fn rate_limited_rx_single_frame(&mut self) -> bool {
273-
// If limiter.consume() fails it means there is no more TokenType::Ops
274-
// budget and rate limiting is in effect.
275-
if !self.rx_rate_limiter.consume(1, TokenType::Ops) {
276-
METRICS.net.rx_rate_limiter_throttled.inc();
277-
return false;
278-
}
279-
// If limiter.consume() fails it means there is no more TokenType::Bytes
280-
// budget and rate limiting is in effect.
281-
if !self
282-
.rx_rate_limiter
283-
.consume(self.rx_bytes_read as u64, TokenType::Bytes)
284-
{
285-
// revert the OPS consume()
286-
self.rx_rate_limiter.manual_replenish(1, TokenType::Ops);
304+
if !Self::rate_limiter_consume_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64) {
287305
METRICS.net.rx_rate_limiter_throttled.inc();
288306
return false;
289307
}
@@ -293,12 +311,10 @@ impl Net {
293311

294312
// Undo the tokens consumption if guest delivery failed.
295313
if !success {
296-
// revert the OPS consume()
297-
self.rx_rate_limiter.manual_replenish(1, TokenType::Ops);
298-
// revert the BYTES consume()
299-
self.rx_rate_limiter
300-
.manual_replenish(self.rx_bytes_read as u64, TokenType::Bytes);
314+
// revert the rate limiting budget consumption
315+
Self::rate_limiter_replenish_op(&mut self.rx_rate_limiter, self.rx_bytes_read as u64);
301316
}
317+
302318
success
303319
}
304320

@@ -409,24 +425,28 @@ impl Net {
409425
fn write_to_mmds_or_tap(
410426
mmds_ns: Option<&mut MmdsNetworkStack>,
411427
rate_limiter: &mut RateLimiter,
412-
frame_buf: &[u8],
428+
frame_buf: &mut [u8],
429+
frame_iovec: &mut IoVecBuffer,
413430
tap: &mut Tap,
414431
guest_mac: Option<MacAddr>,
415432
) -> Result<bool> {
416-
let checked_frame = |frame_buf| {
417-
frame_bytes_from_buf(frame_buf).map_err(|e| {
418-
error!("VNET header missing in the TX frame.");
419-
METRICS.net.tx_malformed_frames.inc();
420-
e
421-
})
422-
};
433+
let frame_header = frame_bytes_from_buf(frame_buf).map_err(|e| {
434+
error!("VNET header missing in the TX frame.");
435+
METRICS.net.tx_malformed_frames.inc();
436+
e
437+
})?;
438+
423439
if let Some(ns) = mmds_ns {
424-
if ns.detour_frame(checked_frame(frame_buf)?) {
440+
if ns.is_mmds_frame(frame_header) {
441+
// Here it is ok to unwrap. We have already checked that the IoVecBuffer holds
442+
// at least the frame headers and we are reading from the beginning of
443+
// the IoVecBuffer.
444+
let frame_len = frame_iovec.read_at(frame_buf, 0).unwrap();
445+
let _ = ns.detour_frame(&frame_buf[..frame_len]);
425446
METRICS.mmds.rx_accepted.inc();
426447

427448
// MMDS frames are not accounted by the rate limiter.
428-
rate_limiter.manual_replenish(frame_buf.len() as u64, TokenType::Bytes);
429-
rate_limiter.manual_replenish(1, TokenType::Ops);
449+
Self::rate_limiter_replenish_op(rate_limiter, frame_iovec.len() as u64);
430450

431451
// MMDS consumed the frame.
432452
return Ok(true);
@@ -437,16 +457,16 @@ impl Net {
437457

438458
// Check for guest MAC spoofing.
439459
if let Some(mac) = guest_mac {
440-
let _ = EthernetFrame::from_bytes(checked_frame(frame_buf)?).map(|eth_frame| {
460+
let _ = EthernetFrame::from_bytes(frame_header).map(|eth_frame| {
441461
if mac != eth_frame.src_mac() {
442462
METRICS.net.tx_spoofed_mac_count.inc();
443463
}
444464
});
445465
}
446466

447-
match tap.write(frame_buf) {
467+
match tap.write_vectored(frame_iovec) {
448468
Ok(_) => {
449-
METRICS.net.tx_bytes_count.add(frame_buf.len());
469+
METRICS.net.tx_bytes_count.add(frame_iovec.len());
450470
METRICS.net.tx_packets_count.inc();
451471
METRICS.net.tx_count.inc();
452472
}
@@ -544,79 +564,29 @@ impl Net {
544564
let tx_queue = &mut self.queues[TX_INDEX];
545565

546566
while let Some(head) = tx_queue.pop_or_enable_notification(mem) {
547-
// If limiter.consume() fails it means there is no more TokenType::Ops
548-
// budget and rate limiting is in effect.
549-
if !self.tx_rate_limiter.consume(1, TokenType::Ops) {
550-
// Stop processing the queue and return this descriptor chain to the
551-
// avail ring, for later processing.
552-
tx_queue.undo_pop();
553-
METRICS.net.tx_rate_limiter_throttled.inc();
554-
break;
555-
}
556-
557567
let head_index = head.index;
558-
let mut read_count = 0;
559-
let mut next_desc = Some(head);
560-
561-
self.tx_iovec.clear();
562-
while let Some(desc) = next_desc {
563-
if desc.is_write_only() {
564-
self.tx_iovec.clear();
565-
break;
566-
}
567-
self.tx_iovec.push((desc.addr, desc.len as usize));
568-
read_count += desc.len as usize;
569-
next_desc = desc.next_descriptor();
570-
}
571-
572-
// If limiter.consume() fails it means there is no more TokenType::Bytes
573-
// budget and rate limiting is in effect.
574-
if !self
575-
.tx_rate_limiter
576-
.consume(read_count as u64, TokenType::Bytes)
577-
{
578-
// revert the OPS consume()
579-
self.tx_rate_limiter.manual_replenish(1, TokenType::Ops);
580-
// Stop processing the queue and return this descriptor chain to the
581-
// avail ring, for later processing.
568+
// Parse IoVecBuffer from descriptor head
569+
let mut buffer = IoVecBuffer::from_descriptor_chain(mem, head).unwrap();
570+
if !Self::rate_limiter_consume_op(&mut self.tx_rate_limiter, buffer.len() as u64) {
582571
tx_queue.undo_pop();
583572
METRICS.net.tx_rate_limiter_throttled.inc();
584573
break;
585574
}
586575

587-
read_count = 0;
588-
// Copy buffer from across multiple descriptors.
589-
// TODO(performance - Issue #420): change this to use `writev()` instead of `write()`
590-
// and get rid of the intermediate buffer.
591-
for (desc_addr, desc_len) in self.tx_iovec.drain(..) {
592-
let limit = cmp::min((read_count + desc_len) as usize, self.tx_frame_buf.len());
593-
594-
let read_result = mem.read_slice(
595-
&mut self.tx_frame_buf[read_count..limit as usize],
596-
desc_addr,
597-
);
598-
match read_result {
599-
Ok(()) => {
600-
read_count += limit - read_count;
601-
METRICS.net.tx_count.inc();
602-
}
603-
Err(e) => {
604-
error!("Failed to read slice: {:?}", e);
605-
match e {
606-
GuestMemoryError::PartialBuffer { .. } => &METRICS.net.tx_partial_reads,
607-
_ => &METRICS.net.tx_fails,
608-
}
609-
.inc();
610-
read_count = 0;
611-
break;
612-
}
613-
}
614-
}
576+
// Read the frame headers from the IoVecBuffer
577+
buffer
578+
.read_at(&mut self.tx_frame_buf[..frame_hdr_len()], 0)
579+
.ok_or_else(|| {
580+
error!("Frame headers missing in the TX frame.");
581+
METRICS.net.tx_malformed_frames.inc();
582+
DeviceError::MalformedPayload
583+
})?;
615584

616585
let frame_consumed_by_mmds = Self::write_to_mmds_or_tap(
617586
self.mmds_ns.as_mut(),
618587
&mut self.tx_rate_limiter,
619-
&self.tx_frame_buf[..read_count],
588+
&mut self.tx_frame_buf,
589+
&mut buffer,
620590
&mut self.tap,
621591
self.guest_mac,
622592
)
@@ -905,13 +875,13 @@ pub mod tests {
905875

906876
#[test]
907877
fn test_vnet_helpers() {
908-
let mut frame_buf = vec![42u8; vnet_hdr_len() - 1];
878+
let frame_buf = vec![42u8; vnet_hdr_len() - 1];
909879
assert_eq!(
910880
format!("{:?}", frame_bytes_from_buf(&frame_buf)),
911881
"Err(VnetHeaderMissing)"
912882
);
913883
assert_eq!(
914-
format!("{:?}", frame_bytes_from_buf_mut(&mut frame_buf)),
884+
format!("{:?}", frame_bytes_from_buf_mut(&mut frame_buf.clone())),
915885
"Err(VnetHeaderMissing)"
916886
);
917887

@@ -1456,6 +1426,10 @@ pub mod tests {
14561426
let dst_ip = Ipv4Addr::new(169, 254, 169, 254);
14571427

14581428
let (frame_buf, frame_len) = create_arp_request(src_mac, src_ip, dst_mac, dst_ip);
1429+
let mut buffer = bufferBuffer::from(&frame_buf[..frame_len]);
1430+
1431+
let mut headers = vec![0; frame_hdr_len()];
1432+
buffer.read_at(&mut headers, 0).unwrap();
14591433

14601434
// Call the code which sends the packet to the host or MMDS.
14611435
// Validate the frame was consumed by MMDS and that the metrics reflect that.
@@ -1465,7 +1439,8 @@ pub mod tests {
14651439
assert!(Net::write_to_mmds_or_tap(
14661440
net.mmds_ns.as_mut(),
14671441
&mut net.tx_rate_limiter,
1468-
&frame_buf[..frame_len],
1442+
&mut headers,
1443+
&mut buffer,
14691444
&mut net.tap,
14701445
Some(src_mac),
14711446
)
@@ -1491,6 +1466,9 @@ pub mod tests {
14911466
let dst_ip = Ipv4Addr::new(10, 1, 1, 1);
14921467

14931468
let (frame_buf, frame_len) = create_arp_request(guest_mac, guest_ip, dst_mac, dst_ip);
1469+
let mut buffer = bufferBuffer::from(&frame_buf[..frame_len]);
1470+
let mut headers = vec![0; frame_hdr_len()];
1471+
buffer.read_at(&mut headers, 0).unwrap();
14941472

14951473
// Check that a legit MAC doesn't affect the spoofed MAC metric.
14961474
check_metric_after_block!(
@@ -1499,7 +1477,8 @@ pub mod tests {
14991477
Net::write_to_mmds_or_tap(
15001478
net.mmds_ns.as_mut(),
15011479
&mut net.tx_rate_limiter,
1502-
&frame_buf[..frame_len],
1480+
&mut headers,
1481+
&mut buffer,
15031482
&mut net.tap,
15041483
Some(guest_mac),
15051484
)
@@ -1512,7 +1491,8 @@ pub mod tests {
15121491
Net::write_to_mmds_or_tap(
15131492
net.mmds_ns.as_mut(),
15141493
&mut net.tx_rate_limiter,
1515-
&frame_buf[..frame_len],
1494+
&mut headers,
1495+
&mut buffer,
15161496
&mut net.tap,
15171497
Some(not_guest_mac),
15181498
)

src/devices/src/virtio/net/tap.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
use net_gen::ifreq;
99
use std::fs::File;
10-
use std::io::{Error as IoError, Read, Result as IoResult, Write};
10+
use std::io::{Error as IoError, IoSlice, Read, Result as IoResult, Write};
1111
use std::os::raw::*;
1212
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
1313
use utils::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val};
@@ -176,6 +176,10 @@ impl Write for Tap {
176176
self.tap_file.write(&buf)
177177
}
178178

179+
fn write_vectored(&mut self, bufs: &[IoSlice<'_>]) -> IoResult<usize> {
180+
self.tap_file.write_vectored(bufs)
181+
}
182+
179183
fn flush(&mut self) -> IoResult<()> {
180184
Ok(())
181185
}

0 commit comments

Comments
 (0)