Skip to content

Commit f35f821

Browse files
edumazet authored and davem330 committed
tcp: defer skb freeing after socket lock is released
tcp recvmsg() (or rx zerocopy) spends a fair amount of time freeing skbs after their payload has been consumed. A typical ~64KB GRO packet has to release ~45 page references, eventually going to page allocator for each of them. Currently, this freeing is performed while socket lock is held, meaning that there is a high chance that BH handler has to queue incoming packets to tcp socket backlog. This can cause additional latencies, because the user thread has to process the backlog at release_sock() time, and while doing so, additional frames can be added by BH handler. This patch adds logic to defer these frees after socket lock is released, or directly from BH handler if possible. Being able to free these skbs from BH handler helps a lot, because this avoids the usual alloc/free asymmetry, when BH handler and user thread do not run on same cpu or NUMA node. One cpu can now be fully utilized for the kernel->user copy, and another cpu is handling BH processing and skb/page allocs/frees (assuming RFS is not forcing use of a single CPU) Tested: 100Gbit NIC Max throughput for one TCP_STREAM flow, over 10 runs MTU : 1500 Before: 55 Gbit After: 66 Gbit MTU : 4096+(headers) Before: 82 Gbit After: 95 Gbit Signed-off-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent 3df684c commit f35f821

File tree

6 files changed

+42
-2
lines changed

6 files changed

+42
-2
lines changed

include/linux/skbuff.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include <linux/splice.h>
3737
#include <linux/in6.h>
3838
#include <linux/if_packet.h>
39+
#include <linux/llist.h>
3940
#include <net/flow.h>
4041
#include <net/page_pool.h>
4142
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -743,6 +744,7 @@ struct sk_buff {
743744
};
744745
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
745746
struct list_head list;
747+
struct llist_node ll_node;
746748
};
747749

748750
union {

include/net/sock.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
#include <linux/indirect_call_wrapper.h>
6464
#include <linux/atomic.h>
6565
#include <linux/refcount.h>
66+
#include <linux/llist.h>
6667
#include <net/dst.h>
6768
#include <net/checksum.h>
6869
#include <net/tcp_states.h>
@@ -408,6 +409,8 @@ struct sock {
408409
struct sk_buff *head;
409410
struct sk_buff *tail;
410411
} sk_backlog;
412+
struct llist_head defer_list;
413+
411414
#define sk_rmem_alloc sk_backlog.rmem_alloc
412415

413416
int sk_forward_alloc;

include/net/tcp.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1368,6 +1368,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
13681368
}
13691369

13701370
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
1371+
1372+
void __sk_defer_free_flush(struct sock *sk);
1373+
1374+
static inline void sk_defer_free_flush(struct sock *sk)
1375+
{
1376+
if (llist_empty(&sk->defer_list))
1377+
return;
1378+
__sk_defer_free_flush(sk);
1379+
}
1380+
13711381
int tcp_filter(struct sock *sk, struct sk_buff *skb);
13721382
void tcp_set_state(struct sock *sk, int state);
13731383
void tcp_done(struct sock *sk);

net/ipv4/tcp.c

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1580,14 +1580,34 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
15801580
tcp_send_ack(sk);
15811581
}
15821582

1583+
void __sk_defer_free_flush(struct sock *sk)
1584+
{
1585+
struct llist_node *head;
1586+
struct sk_buff *skb, *n;
1587+
1588+
head = llist_del_all(&sk->defer_list);
1589+
llist_for_each_entry_safe(skb, n, head, ll_node) {
1590+
prefetch(n);
1591+
skb_mark_not_on_list(skb);
1592+
__kfree_skb(skb);
1593+
}
1594+
}
1595+
EXPORT_SYMBOL(__sk_defer_free_flush);
1596+
15831597
static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
15841598
{
1599+
__skb_unlink(skb, &sk->sk_receive_queue);
15851600
if (likely(skb->destructor == sock_rfree)) {
15861601
sock_rfree(skb);
15871602
skb->destructor = NULL;
15881603
skb->sk = NULL;
1604+
if (!skb_queue_empty(&sk->sk_receive_queue) ||
1605+
!llist_empty(&sk->defer_list)) {
1606+
llist_add(&skb->ll_node, &sk->defer_list);
1607+
return;
1608+
}
15891609
}
1590-
sk_eat_skb(sk, skb);
1610+
__kfree_skb(skb);
15911611
}
15921612

15931613
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
@@ -2422,6 +2442,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
24222442
/* Do not sleep, just process backlog. */
24232443
__sk_flush_backlog(sk);
24242444
} else {
2445+
sk_defer_free_flush(sk);
24252446
sk_wait_data(sk, &timeo, last);
24262447
}
24272448

@@ -2540,6 +2561,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
25402561
ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
25412562
&cmsg_flags);
25422563
release_sock(sk);
2564+
sk_defer_free_flush(sk);
25432565

25442566
if (cmsg_flags && ret >= 0) {
25452567
if (cmsg_flags & TCP_CMSG_TS)
@@ -3065,7 +3087,7 @@ int tcp_disconnect(struct sock *sk, int flags)
30653087
sk->sk_frag.page = NULL;
30663088
sk->sk_frag.offset = 0;
30673089
}
3068-
3090+
sk_defer_free_flush(sk);
30693091
sk_error_report(sk);
30703092
return 0;
30713093
}
@@ -4194,6 +4216,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
41944216
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
41954217
&zc, &len, err);
41964218
release_sock(sk);
4219+
sk_defer_free_flush(sk);
41974220
if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
41984221
goto zerocopy_rcv_cmsg;
41994222
switch (len) {

net/ipv4/tcp_ipv4.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2102,6 +2102,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
21022102

21032103
sk_incoming_cpu_update(sk);
21042104

2105+
sk_defer_free_flush(sk);
21052106
bh_lock_sock_nested(sk);
21062107
tcp_segs_in(tcp_sk(sk), skb);
21072108
ret = 0;

net/ipv6/tcp_ipv6.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1758,6 +1758,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
17581758

17591759
sk_incoming_cpu_update(sk);
17601760

1761+
sk_defer_free_flush(sk);
17611762
bh_lock_sock_nested(sk);
17621763
tcp_segs_in(tcp_sk(sk), skb);
17631764
ret = 0;

0 commit comments

Comments (0)