Skip to content

Commit 497dbb2

Browse files
wdebruij authored and davem330 committed
gve: Add optional metadata descriptor type GVE_TXD_MTD
Allow drivers to pass metadata along with packet data to the device. Introduce a new metadata descriptor type * GVE_TXD_MTD This descriptor is optional. If present it immediate follows the packet descriptor and precedes the segment descriptor. This descriptor may be repeated. Multiple metadata descriptors may follow. There are no immediate uses for this, this is for future proofing. At present devices allow only 1 MTD descriptor. The lower four bits of the type_flags field encode GVE_TXD_MTD. The upper four bits of the type_flags field encodes a *sub*type. Introduce one such metadata descriptor subtype * GVE_MTD_SUBTYPE_PATH This shares path information with the device for network failure discovery and robust response: Linux derives ipv6 flowlabel and ECMP multipath from sk->sk_txhash, and updates this field on error with sk_rethink_txhash. Allow the host stack to do the same. Pass the tx_hash value if set. Also communicate whether the path hash is set, or more exactly, what its type is. Define two common types GVE_MTD_PATH_HASH_NONE GVE_MTD_PATH_HASH_L4 Concrete examples of error conditions that are resolved are mentioned in the commits that add sk_rethink_txhash calls. Such as commit 7788174 ("tcp: change IPv6 flow-label upon receiving spurious retransmission"). Experimental results mirror what the theory suggests: where IPv6 FlowLabel is included in path selection (e.g., LAG/ECMP), flowlabel rotation on TCP timeout avoids the vast majority of TCP disconnects that would otherwise have occurred during link failures in long-haul backbones, when an alternative path is available. Rotation can be applied to various bad connection signals, such as timeouts and spurious retransmissions. In aggregate, such flow level signals can help locate network issues. 
Define initial common states: GVE_MTD_PATH_STATE_DEFAULT GVE_MTD_PATH_STATE_TIMEOUT GVE_MTD_PATH_STATE_CONGESTION GVE_MTD_PATH_STATE_RETRANSMIT Signed-off-by: Willem de Bruijn <[email protected]> Signed-off-by: David Awogbemila <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 5fd07df commit 497dbb2

File tree

3 files changed

+74
-20
lines changed

3 files changed

+74
-20
lines changed

drivers/net/ethernet/google/gve/gve.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ struct gve_rx_ring {
229229
/* A TX desc ring entry */
230230
union gve_tx_desc {
231231
struct gve_tx_pkt_desc pkt; /* first desc for a packet */
232+
struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */
232233
struct gve_tx_seg_desc seg; /* subsequent descs for a packet */
233234
};
234235

drivers/net/ethernet/google/gve/gve_desc.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@ struct gve_tx_pkt_desc {
3333
__be64 seg_addr; /* Base address (see note) of this segment */
3434
} __packed;
3535

36+
struct gve_tx_mtd_desc {
37+
u8 type_flags; /* type is lower 4 bits, subtype upper */
38+
u8 path_state; /* state is lower 4 bits, hash type upper */
39+
__be16 reserved0;
40+
__be32 path_hash;
41+
__be64 reserved1;
42+
} __packed;
43+
3644
struct gve_tx_seg_desc {
3745
u8 type_flags; /* type is lower 4 bits, flags upper */
3846
u8 l3_offset; /* TSO: 2 byte units to start of IPH */
@@ -46,6 +54,7 @@ struct gve_tx_seg_desc {
4654
#define GVE_TXD_STD (0x0 << 4) /* Std with Host Address */
4755
#define GVE_TXD_TSO (0x1 << 4) /* TSO with Host Address */
4856
#define GVE_TXD_SEG (0x2 << 4) /* Seg with Host Address */
57+
#define GVE_TXD_MTD (0x3 << 4) /* Metadata */
4958

5059
/* GVE Transmit Descriptor Flags for Std Pkts */
5160
#define GVE_TXF_L4CSUM BIT(0) /* Need csum offload */
@@ -54,6 +63,17 @@ struct gve_tx_seg_desc {
5463
/* GVE Transmit Descriptor Flags for TSO Segs */
5564
#define GVE_TXSF_IPV6 BIT(1) /* IPv6 TSO */
5665

66+
/* GVE Transmit Descriptor Options for MTD Segs */

/* MTD descriptor subtype.
 * NOTE(review): the struct comment says the subtype lives in the upper
 * nibble of type_flags while GVE_TXD_* values are shifted <<4 — exact
 * bit placement is worth confirming against the device spec.
 */
#define	GVE_MTD_SUBTYPE_PATH		0

/* Path state values (lower 4 bits of mtd.path_state) */
#define	GVE_MTD_PATH_STATE_DEFAULT	0
#define	GVE_MTD_PATH_STATE_TIMEOUT	1
#define	GVE_MTD_PATH_STATE_CONGESTION	2
#define	GVE_MTD_PATH_STATE_RETRANSMIT	3

/* Path hash type values (upper 4 bits of mtd.path_state) */
#define	GVE_MTD_PATH_HASH_NONE         (0x0 << 4)
#define	GVE_MTD_PATH_HASH_L4           (0x1 << 4)
76+
5777
/* GVE Receive Packet Descriptor */
5878
/* The start of an ethernet packet comes 2 bytes into the rx buffer.
5979
* gVNIC adds this padding so that both the DMA and the L3/4 protocol header

drivers/net/ethernet/google/gve/gve_tx.c

Lines changed: 53 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -296,11 +296,14 @@ static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx,
296296
return bytes;
297297
}
298298

299-
/* The most descriptors we could need is MAX_SKB_FRAGS + 3 : 1 for each skb frag,
300-
* +1 for the skb linear portion, +1 for when tcp hdr needs to be in separate descriptor,
301-
* and +1 if the payload wraps to the beginning of the FIFO.
299+
/* The most descriptors we could need is MAX_SKB_FRAGS + 4 :
300+
* 1 for each skb frag
301+
* 1 for the skb linear portion
302+
* 1 for when tcp hdr needs to be in separate descriptor
303+
* 1 if the payload wraps to the beginning of the FIFO
304+
* 1 for metadata descriptor
302305
*/
303-
#define MAX_TX_DESC_NEEDED (MAX_SKB_FRAGS + 3)
306+
#define MAX_TX_DESC_NEEDED (MAX_SKB_FRAGS + 4)
304307
static void gve_tx_unmap_buf(struct device *dev, struct gve_tx_buffer_state *info)
305308
{
306309
if (info->skb) {
@@ -395,6 +398,19 @@ static void gve_tx_fill_pkt_desc(union gve_tx_desc *pkt_desc,
395398
pkt_desc->pkt.seg_addr = cpu_to_be64(addr);
396399
}
397400

401+
static void gve_tx_fill_mtd_desc(union gve_tx_desc *mtd_desc,
402+
struct sk_buff *skb)
403+
{
404+
BUILD_BUG_ON(sizeof(mtd_desc->mtd) != sizeof(mtd_desc->pkt));
405+
406+
mtd_desc->mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
407+
mtd_desc->mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT |
408+
GVE_MTD_PATH_HASH_L4;
409+
mtd_desc->mtd.path_hash = cpu_to_be32(skb->hash);
410+
mtd_desc->mtd.reserved0 = 0;
411+
mtd_desc->mtd.reserved1 = 0;
412+
}
413+
398414
static void gve_tx_fill_seg_desc(union gve_tx_desc *seg_desc,
399415
struct sk_buff *skb, bool is_gso,
400416
u16 len, u64 addr)
@@ -426,6 +442,7 @@ static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
426442
int pad_bytes, hlen, hdr_nfrags, payload_nfrags, l4_hdr_offset;
427443
union gve_tx_desc *pkt_desc, *seg_desc;
428444
struct gve_tx_buffer_state *info;
445+
int mtd_desc_nr = !!skb->l4_hash;
429446
bool is_gso = skb_is_gso(skb);
430447
u32 idx = tx->req & tx->mask;
431448
int payload_iov = 2;
@@ -457,7 +474,7 @@ static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
457474
&info->iov[payload_iov]);
458475

459476
gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
460-
1 + payload_nfrags, hlen,
477+
1 + mtd_desc_nr + payload_nfrags, hlen,
461478
info->iov[hdr_nfrags - 1].iov_offset);
462479

463480
skb_copy_bits(skb, 0,
@@ -468,8 +485,13 @@ static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
468485
info->iov[hdr_nfrags - 1].iov_len);
469486
copy_offset = hlen;
470487

488+
if (mtd_desc_nr) {
489+
next_idx = (tx->req + 1) & tx->mask;
490+
gve_tx_fill_mtd_desc(&tx->desc[next_idx], skb);
491+
}
492+
471493
for (i = payload_iov; i < payload_nfrags + payload_iov; i++) {
472-
next_idx = (tx->req + 1 + i - payload_iov) & tx->mask;
494+
next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask;
473495
seg_desc = &tx->desc[next_idx];
474496

475497
gve_tx_fill_seg_desc(seg_desc, skb, is_gso,
@@ -485,16 +507,17 @@ static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st
485507
copy_offset += info->iov[i].iov_len;
486508
}
487509

488-
return 1 + payload_nfrags;
510+
return 1 + mtd_desc_nr + payload_nfrags;
489511
}
490512

491513
static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
492514
struct sk_buff *skb)
493515
{
494516
const struct skb_shared_info *shinfo = skb_shinfo(skb);
495-
int hlen, payload_nfrags, l4_hdr_offset;
496-
union gve_tx_desc *pkt_desc, *seg_desc;
517+
int hlen, num_descriptors, l4_hdr_offset;
518+
union gve_tx_desc *pkt_desc, *mtd_desc, *seg_desc;
497519
struct gve_tx_buffer_state *info;
520+
int mtd_desc_nr = !!skb->l4_hash;
498521
bool is_gso = skb_is_gso(skb);
499522
u32 idx = tx->req & tx->mask;
500523
u64 addr;
@@ -523,23 +546,30 @@ static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
523546
dma_unmap_len_set(info, len, len);
524547
dma_unmap_addr_set(info, dma, addr);
525548

526-
payload_nfrags = shinfo->nr_frags;
549+
num_descriptors = 1 + shinfo->nr_frags;
550+
if (hlen < len)
551+
num_descriptors++;
552+
if (mtd_desc_nr)
553+
num_descriptors++;
554+
555+
gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
556+
num_descriptors, hlen, addr);
557+
558+
if (mtd_desc_nr) {
559+
idx = (idx + 1) & tx->mask;
560+
mtd_desc = &tx->desc[idx];
561+
gve_tx_fill_mtd_desc(mtd_desc, skb);
562+
}
563+
527564
if (hlen < len) {
528565
/* For gso the rest of the linear portion of the skb needs to
529566
* be in its own descriptor.
530567
*/
531-
payload_nfrags++;
532-
gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
533-
1 + payload_nfrags, hlen, addr);
534-
535568
len -= hlen;
536569
addr += hlen;
537-
idx = (tx->req + 1) & tx->mask;
570+
idx = (idx + 1) & tx->mask;
538571
seg_desc = &tx->desc[idx];
539572
gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
540-
} else {
541-
gve_tx_fill_pkt_desc(pkt_desc, skb, is_gso, l4_hdr_offset,
542-
1 + payload_nfrags, hlen, addr);
543573
}
544574

545575
for (i = 0; i < shinfo->nr_frags; i++) {
@@ -560,11 +590,14 @@ static int gve_tx_add_skb_no_copy(struct gve_priv *priv, struct gve_tx_ring *tx,
560590
gve_tx_fill_seg_desc(seg_desc, skb, is_gso, len, addr);
561591
}
562592

563-
return 1 + payload_nfrags;
593+
return num_descriptors;
564594

565595
unmap_drop:
566-
i += (payload_nfrags == shinfo->nr_frags ? 1 : 2);
596+
i += num_descriptors - shinfo->nr_frags;
567597
while (i--) {
598+
/* Skip metadata descriptor, if set */
599+
if (i == 1 && mtd_desc_nr == 1)
600+
continue;
568601
idx--;
569602
gve_tx_unmap_buf(tx->dev, &tx->info[idx & tx->mask]);
570603
}

0 commit comments

Comments
 (0)