Skip to content

Commit 8e845f4

Browse files
David L Stevens authored and davem330 (David S. Miller) committed
sunvnet: make transmit path zero-copy in the kernel
This patch removes pre-allocated transmit buffers and instead directly maps pending packets on demand. This saves O(n^2) maximum-sized transmit buffers, for n hosts on a vswitch, as well as a copy to those buffers. Single-stream TCP throughput for linux-solaris dropped ~5% at 1500-byte MTU, but linux-linux at 1500 bytes increased ~20%.

Signed-off-by: David L Stevens <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent e4defc7 commit 8e845f4

File tree

2 files changed

+182
-45
lines changed

2 files changed

+182
-45
lines changed

drivers/net/ethernet/sun/sunvnet.c

Lines changed: 174 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,117 @@ struct vnet_port *tx_port_find(struct vnet *vp, struct sk_buff *skb)
780780
return ret;
781781
}
782782

783+
/* Reap completed transmit descriptors.
 *
 * Walks the TX dring backwards starting from the slot most recently
 * produced (dr->prod - 1, wrapping around VNET_TX_RING_SIZE):
 *   - VIO_DESC_DONE: the peer has consumed the packet; unlink the skb
 *     from tx_bufs, chain it onto the return list via skb->next, unmap
 *     its LDC cookies, and mark the descriptor FREE.
 *   - VIO_DESC_READY: still owned by the peer; just count it in *pending.
 *   - VIO_DESC_FREE: nothing older can be outstanding; stop the scan.
 *
 * Caller must hold port->vio.lock. Returns the chain of reaped skbs so
 * the caller can free them with the lock dropped (vnet_free_skbs());
 * *pending is set to the number of descriptors still in flight.
 */
static struct sk_buff *vnet_clean_tx_ring(struct vnet_port *port,
					  unsigned *pending)
{
	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
	struct sk_buff *skb = NULL;
	int i, txi;

	*pending = 0;

	/* Start at the last slot we produced into, wrapping if needed. */
	txi = dr->prod-1;
	if (txi < 0)
		txi = VNET_TX_RING_SIZE-1;

	for (i = 0; i < VNET_TX_RING_SIZE; ++i) {
		struct vio_net_desc *d;

		d = vio_dring_entry(dr, txi);

		if (d->hdr.state == VIO_DESC_DONE) {
			if (port->tx_bufs[txi].skb) {
				/* Reaped skbs must not already be chained. */
				BUG_ON(port->tx_bufs[txi].skb->next);

				/* Push onto the head of the return list. */
				port->tx_bufs[txi].skb->next = skb;
				skb = port->tx_bufs[txi].skb;
				port->tx_bufs[txi].skb = NULL;

				/* Release the LDC mapping for this slot. */
				ldc_unmap(port->vio.lp,
					  port->tx_bufs[txi].cookies,
					  port->tx_bufs[txi].ncookies);
			}
			d->hdr.state = VIO_DESC_FREE;
		} else if (d->hdr.state == VIO_DESC_READY) {
			/* Still pending at the peer; caller may re-arm
			 * the clean timer based on this count.
			 */
			(*pending)++;
		} else if (d->hdr.state == VIO_DESC_FREE) {
			/* Everything older was already reaped. */
			break;
		}
		--txi;
		if (txi < 0)
			txi = VNET_TX_RING_SIZE-1;
	}
	return skb;
}
825+
826+
static inline void vnet_free_skbs(struct sk_buff *skb)
827+
{
828+
struct sk_buff *next;
829+
830+
while (skb) {
831+
next = skb->next;
832+
skb->next = NULL;
833+
dev_kfree_skb(skb);
834+
skb = next;
835+
}
836+
}
837+
838+
static void vnet_clean_timer_expire(unsigned long port0)
839+
{
840+
struct vnet_port *port = (struct vnet_port *)port0;
841+
struct sk_buff *freeskbs;
842+
unsigned pending;
843+
unsigned long flags;
844+
845+
spin_lock_irqsave(&port->vio.lock, flags);
846+
freeskbs = vnet_clean_tx_ring(port, &pending);
847+
spin_unlock_irqrestore(&port->vio.lock, flags);
848+
849+
vnet_free_skbs(freeskbs);
850+
851+
if (pending)
852+
(void)mod_timer(&port->clean_timer,
853+
jiffies + VNET_CLEAN_TIMEOUT);
854+
else
855+
del_timer(&port->clean_timer);
856+
}
857+
858+
static inline struct sk_buff *vnet_skb_shape(struct sk_buff *skb, void **pstart,
859+
int *plen)
860+
{
861+
struct sk_buff *nskb;
862+
int len, pad;
863+
864+
len = skb->len;
865+
pad = 0;
866+
if (len < ETH_ZLEN) {
867+
pad += ETH_ZLEN - skb->len;
868+
len += pad;
869+
}
870+
len += VNET_PACKET_SKIP;
871+
pad += 8 - (len & 7);
872+
len += 8 - (len & 7);
873+
874+
if (((unsigned long)skb->data & 7) != VNET_PACKET_SKIP ||
875+
skb_tailroom(skb) < pad ||
876+
skb_headroom(skb) < VNET_PACKET_SKIP) {
877+
nskb = alloc_and_align_skb(skb->dev, skb->len);
878+
skb_reserve(nskb, VNET_PACKET_SKIP);
879+
if (skb_copy_bits(skb, 0, nskb->data, skb->len)) {
880+
dev_kfree_skb(nskb);
881+
dev_kfree_skb(skb);
882+
return NULL;
883+
}
884+
(void)skb_put(nskb, skb->len);
885+
dev_kfree_skb(skb);
886+
skb = nskb;
887+
}
888+
889+
*pstart = skb->data - VNET_PACKET_SKIP;
890+
*plen = len;
891+
return skb;
892+
}
893+
783894
static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
784895
{
785896
struct vnet *vp = netdev_priv(dev);
@@ -788,12 +899,20 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
788899
struct vio_net_desc *d;
789900
unsigned long flags;
790901
unsigned int len;
791-
void *tx_buf;
792-
int i, err;
902+
struct sk_buff *freeskbs = NULL;
903+
int i, err, txi;
904+
void *start = NULL;
905+
int nlen = 0;
906+
unsigned pending = 0;
793907

794908
if (unlikely(!port))
795909
goto out_dropped;
796910

911+
skb = vnet_skb_shape(skb, &start, &nlen);
912+
913+
if (unlikely(!skb))
914+
goto out_dropped;
915+
797916
spin_lock_irqsave(&port->vio.lock, flags);
798917

799918
dr = &port->vio.drings[VIO_DRIVER_TX_RING];
@@ -811,14 +930,27 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
811930

812931
d = vio_dring_cur(dr);
813932

814-
tx_buf = port->tx_bufs[dr->prod].buf;
815-
skb_copy_from_linear_data(skb, tx_buf + VNET_PACKET_SKIP, skb->len);
933+
txi = dr->prod;
934+
935+
freeskbs = vnet_clean_tx_ring(port, &pending);
936+
937+
BUG_ON(port->tx_bufs[txi].skb);
816938

817939
len = skb->len;
818-
if (len < ETH_ZLEN) {
940+
if (len < ETH_ZLEN)
819941
len = ETH_ZLEN;
820-
memset(tx_buf+VNET_PACKET_SKIP+skb->len, 0, len - skb->len);
942+
943+
port->tx_bufs[txi].skb = skb;
944+
skb = NULL;
945+
946+
err = ldc_map_single(port->vio.lp, start, nlen,
947+
port->tx_bufs[txi].cookies, 2,
948+
(LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_RW));
949+
if (err < 0) {
950+
netdev_info(dev, "tx buffer map error %d\n", err);
951+
goto out_dropped_unlock;
821952
}
953+
port->tx_bufs[txi].ncookies = err;
822954

823955
/* We don't rely on the ACKs to free the skb in vnet_start_xmit(),
824956
* thus it is safe to not set VIO_ACK_ENABLE for each transmission:
@@ -830,9 +962,9 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
830962
*/
831963
d->hdr.ack = VIO_ACK_DISABLE;
832964
d->size = len;
833-
d->ncookies = port->tx_bufs[dr->prod].ncookies;
965+
d->ncookies = port->tx_bufs[txi].ncookies;
834966
for (i = 0; i < d->ncookies; i++)
835-
d->cookies[i] = port->tx_bufs[dr->prod].cookies[i];
967+
d->cookies[i] = port->tx_bufs[txi].cookies[i];
836968

837969
/* This has to be a non-SMP write barrier because we are writing
838970
* to memory which is shared with the peer LDOM.
@@ -876,7 +1008,7 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
8761008
port->start_cons = false;
8771009

8781010
dev->stats.tx_packets++;
879-
dev->stats.tx_bytes += skb->len;
1011+
dev->stats.tx_bytes += port->tx_bufs[txi].skb->len;
8801012

8811013
dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1);
8821014
if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
@@ -887,15 +1019,24 @@ static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
8871019

8881020
spin_unlock_irqrestore(&port->vio.lock, flags);
8891021

890-
dev_kfree_skb(skb);
1022+
vnet_free_skbs(freeskbs);
1023+
1024+
(void)mod_timer(&port->clean_timer, jiffies + VNET_CLEAN_TIMEOUT);
8911025

8921026
return NETDEV_TX_OK;
8931027

8941028
out_dropped_unlock:
8951029
spin_unlock_irqrestore(&port->vio.lock, flags);
8961030

8971031
out_dropped:
898-
dev_kfree_skb(skb);
1032+
if (skb)
1033+
dev_kfree_skb(skb);
1034+
vnet_free_skbs(freeskbs);
1035+
if (pending)
1036+
(void)mod_timer(&port->clean_timer,
1037+
jiffies + VNET_CLEAN_TIMEOUT);
1038+
else
1039+
del_timer(&port->clean_timer);
8991040
dev->stats.tx_dropped++;
9001041
return NETDEV_TX_OK;
9011042
}
@@ -1097,17 +1238,22 @@ static void vnet_port_free_tx_bufs(struct vnet_port *port)
10971238
}
10981239

10991240
for (i = 0; i < VNET_TX_RING_SIZE; i++) {
1100-
void *buf = port->tx_bufs[i].buf;
1241+
struct vio_net_desc *d;
1242+
void *skb = port->tx_bufs[i].skb;
11011243

1102-
if (!buf)
1244+
if (!skb)
11031245
continue;
11041246

1247+
d = vio_dring_entry(dr, i);
1248+
if (d->hdr.state == VIO_DESC_READY)
1249+
pr_warn("active transmit buffers freed\n");
1250+
11051251
ldc_unmap(port->vio.lp,
11061252
port->tx_bufs[i].cookies,
11071253
port->tx_bufs[i].ncookies);
1108-
1109-
kfree(buf);
1110-
port->tx_bufs[i].buf = NULL;
1254+
dev_kfree_skb(skb);
1255+
port->tx_bufs[i].skb = NULL;
1256+
d->hdr.state = VIO_DESC_FREE;
11111257
}
11121258
}
11131259

@@ -1118,34 +1264,6 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
11181264
int i, err, ncookies;
11191265
void *dring;
11201266

1121-
for (i = 0; i < VNET_TX_RING_SIZE; i++) {
1122-
void *buf = kzalloc(VNET_MAXPACKET + 8, GFP_KERNEL);
1123-
int map_len = (VNET_MAXPACKET + 7) & ~7;
1124-
1125-
err = -ENOMEM;
1126-
if (!buf)
1127-
goto err_out;
1128-
1129-
err = -EFAULT;
1130-
if ((unsigned long)buf & (8UL - 1)) {
1131-
pr_err("TX buffer misaligned\n");
1132-
kfree(buf);
1133-
goto err_out;
1134-
}
1135-
1136-
err = ldc_map_single(port->vio.lp, buf, map_len,
1137-
port->tx_bufs[i].cookies, 2,
1138-
(LDC_MAP_SHADOW |
1139-
LDC_MAP_DIRECT |
1140-
LDC_MAP_RW));
1141-
if (err < 0) {
1142-
kfree(buf);
1143-
goto err_out;
1144-
}
1145-
port->tx_bufs[i].buf = buf;
1146-
port->tx_bufs[i].ncookies = err;
1147-
}
1148-
11491267
dr = &port->vio.drings[VIO_DRIVER_TX_RING];
11501268

11511269
len = (VNET_TX_RING_SIZE *
@@ -1172,6 +1290,12 @@ static int vnet_port_alloc_tx_bufs(struct vnet_port *port)
11721290
dr->pending = VNET_TX_RING_SIZE;
11731291
dr->ncookies = ncookies;
11741292

1293+
for (i = 0; i < VNET_TX_RING_SIZE; ++i) {
1294+
struct vio_net_desc *d;
1295+
1296+
d = vio_dring_entry(dr, i);
1297+
d->hdr.state = VIO_DESC_FREE;
1298+
}
11751299
return 0;
11761300

11771301
err_out:
@@ -1203,6 +1327,8 @@ static struct vnet *vnet_new(const u64 *local_mac)
12031327
dev = alloc_etherdev(sizeof(*vp));
12041328
if (!dev)
12051329
return ERR_PTR(-ENOMEM);
1330+
dev->needed_headroom = VNET_PACKET_SKIP + 8;
1331+
dev->needed_tailroom = 8;
12061332

12071333
for (i = 0; i < ETH_ALEN; i++)
12081334
dev->dev_addr[i] = (*local_mac >> (5 - i) * 8) & 0xff;
@@ -1397,6 +1523,9 @@ static int vnet_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
13971523
pr_info("%s: PORT ( remote-mac %pM%s )\n",
13981524
vp->dev->name, port->raddr, switch_port ? " switch-port" : "");
13991525

1526+
setup_timer(&port->clean_timer, vnet_clean_timer_expire,
1527+
(unsigned long)port);
1528+
14001529
vio_port_up(&port->vio);
14011530

14021531
mdesc_release(hp);
@@ -1423,6 +1552,7 @@ static int vnet_port_remove(struct vio_dev *vdev)
14231552
unsigned long flags;
14241553

14251554
del_timer_sync(&port->vio.timer);
1555+
del_timer_sync(&port->clean_timer);
14261556

14271557
spin_lock_irqsave(&vp->lock, flags);
14281558
list_del(&port->list);

drivers/net/ethernet/sun/sunvnet.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
*/
1212
#define VNET_TX_TIMEOUT (5 * HZ)
1313

14+
/* length of time (or less) we expect pending descriptors to be marked
15+
* as VIO_DESC_DONE and skbs ready to be freed
16+
*/
17+
#define VNET_CLEAN_TIMEOUT ((HZ/100)+1)
18+
1419
#define VNET_MAXPACKET 1518ULL /* ETH_FRAMELEN + VLAN_HDR */
1520
#define VNET_TX_RING_SIZE 512
1621
#define VNET_TX_WAKEUP_THRESH(dr) ((dr)->pending / 4)
@@ -22,7 +27,7 @@
2227
#define VNET_PACKET_SKIP 6
2328

2429
/* Per-slot transmit state for the TX dring: the skb currently mapped
 * into this descriptor (NULL when the slot is free) and the LDC
 * translation cookies describing its mapping, released via ldc_unmap()
 * when the descriptor is reaped.
 */
struct vnet_tx_entry {
	struct sk_buff *skb;	/* pending packet; replaces the old pre-allocated buf */
	unsigned int ncookies;	/* number of valid entries in cookies[] */
	struct ldc_trans_cookie cookies[2];
};
@@ -46,6 +51,8 @@ struct vnet_port {
4651
bool stop_rx;
4752
bool start_cons;
4853

54+
struct timer_list clean_timer;
55+
4956
u64 rmtu;
5057
};
5158

0 commit comments

Comments
 (0)