Skip to content

Commit 9cacf81

Browse files
fomichev authored and Alexei Starovoitov committed
bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE
Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE. We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom call in do_tcp_getsockopt using the on-stack data. This removes 3% overhead for locking/unlocking the socket. Without this patch: 3.38% 0.07% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt | --3.30%--__cgroup_bpf_run_filter_getsockopt | --0.81%--__kmalloc With the patch applied: 0.52% 0.12% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt_kern Note, exporting uapi/tcp.h requires removing netinet/tcp.h from test_progs.h because those headers have conflicting definitions. Signed-off-by: Stanislav Fomichev <[email protected]> Signed-off-by: Alexei Starovoitov <[email protected]> Acked-by: Martin KaFai Lau <[email protected]> Link: https://lore.kernel.org/bpf/[email protected]
1 parent 13ca51d commit 9cacf81

File tree

16 files changed

+506
-7
lines changed

16 files changed

+506
-7
lines changed

include/linux/bpf-cgroup.h

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
147147
int __user *optlen, int max_optlen,
148148
int retval);
149149

150+
int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
151+
int optname, void *optval,
152+
int *optlen, int retval);
153+
150154
static inline enum bpf_cgroup_storage_type cgroup_storage_type(
151155
struct bpf_map *map)
152156
{
@@ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
364368
({ \
365369
int __ret = retval; \
366370
if (cgroup_bpf_enabled) \
367-
__ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \
368-
optname, optval, \
369-
optlen, max_optlen, \
370-
retval); \
371+
if (!(sock)->sk_prot->bpf_bypass_getsockopt || \
372+
!INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
373+
tcp_bpf_bypass_getsockopt, \
374+
level, optname)) \
375+
__ret = __cgroup_bpf_run_filter_getsockopt( \
376+
sock, level, optname, optval, optlen, \
377+
max_optlen, retval); \
378+
__ret; \
379+
})
380+
381+
/*
 * Run the getsockopt cgroup-BPF hook on a value held in a kernel buffer.
 *
 * Evaluates to @retval unchanged when cgroup_bpf_enabled is false;
 * otherwise evaluates to whatever
 * __cgroup_bpf_run_filter_getsockopt_kern() returns.
 *
 * @retval is evaluated exactly once: it is captured in __ret and that
 * captured value is what gets forwarded to the filter, so an argument
 * with side effects is safe to pass.
 */
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval,      \
					    optlen, retval)		       \
({									       \
	int __ret = (retval);						       \
	if (cgroup_bpf_enabled)						       \
		__ret = __cgroup_bpf_run_filter_getsockopt_kern(	       \
			sock, level, optname, optval, optlen, __ret);	       \
	__ret;								       \
})
373390

@@ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
452469
#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; })
453470
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \
454471
optlen, max_optlen, retval) ({ retval; })
472+
/* Stub variant: runs no program and simply evaluates to @retval. */
#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \
					    optlen, retval)		  \
	({ retval; })
455474
#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
456475
kernel_optval) ({ 0; })
457476

include/linux/indirect_call_wrapper.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,10 @@
6060
#define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
6161
#endif
6262

63+
/*
 * INDIRECT_CALL_INET_1 - call @f with a single candidate target @f1.
 *
 * Without CONFIG_INET the candidate is ignored and @f is called directly;
 * with CONFIG_INET the call is routed through INDIRECT_CALL_1().
 */
#if !IS_ENABLED(CONFIG_INET)
#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__)
#else
#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
#endif
68+
6369
#endif

include/net/sock.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,8 @@ struct proto {
11741174

11751175
int (*backlog_rcv) (struct sock *sk,
11761176
struct sk_buff *skb);
1177+
bool (*bpf_bypass_getsockopt)(int level,
1178+
int optname);
11771179

11781180
void (*release_cb)(struct sock *sk);
11791181

include/net/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock,
403403
struct poll_table_struct *wait);
404404
int tcp_getsockopt(struct sock *sk, int level, int optname,
405405
char __user *optval, int __user *optlen);
406+
bool tcp_bpf_bypass_getsockopt(int level, int optname);
406407
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
407408
unsigned int optlen);
408409
void tcp_set_keepalive(struct sock *sk, int val);

kernel/bpf/cgroup.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1486,6 +1486,52 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
14861486
sockopt_free_buf(&ctx);
14871487
return ret;
14881488
}
1489+
1490+
int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1491+
int optname, void *optval,
1492+
int *optlen, int retval)
1493+
{
1494+
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1495+
struct bpf_sockopt_kern ctx = {
1496+
.sk = sk,
1497+
.level = level,
1498+
.optname = optname,
1499+
.retval = retval,
1500+
.optlen = *optlen,
1501+
.optval = optval,
1502+
.optval_end = optval + *optlen,
1503+
};
1504+
int ret;
1505+
1506+
/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
1507+
* user data back into BPF buffer when reval != 0. This is
1508+
* done as an optimization to avoid extra copy, assuming
1509+
* kernel won't populate the data in case of an error.
1510+
* Here we always pass the data and memset() should
1511+
* be called if that data shouldn't be "exported".
1512+
*/
1513+
1514+
ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
1515+
&ctx, BPF_PROG_RUN);
1516+
if (!ret)
1517+
return -EPERM;
1518+
1519+
if (ctx.optlen > *optlen)
1520+
return -EFAULT;
1521+
1522+
/* BPF programs only allowed to set retval to 0, not some
1523+
* arbitrary value.
1524+
*/
1525+
if (ctx.retval != 0 && ctx.retval != retval)
1526+
return -EFAULT;
1527+
1528+
/* BPF programs can shrink the buffer, export the modifications.
1529+
*/
1530+
if (ctx.optlen != 0)
1531+
*optlen = ctx.optlen;
1532+
1533+
return ctx.retval;
1534+
}
14891535
#endif
14901536

14911537
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,

net/ipv4/tcp.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
40994099
return -EFAULT;
41004100
lock_sock(sk);
41014101
err = tcp_zerocopy_receive(sk, &zc);
4102+
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4103+
&zc, &len, err);
41024104
release_sock(sk);
41034105
if (len >= offsetofend(struct tcp_zerocopy_receive, err))
41044106
goto zerocopy_rcv_sk_err;
@@ -4133,6 +4135,18 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
41334135
return 0;
41344136
}
41354137

4138+
/*
 * Tell the generic getsockopt path whether the cgroup-BPF hook should be
 * skipped for this (level, optname) pair.
 *
 * Only SOL_TCP / TCP_ZEROCOPY_RECEIVE is bypassed: do_tcp_getsockopt has
 * an optimized getsockopt implementation for it that avoids an extra
 * socket lock.
 */
bool tcp_bpf_bypass_getsockopt(int level, int optname)
{
	return level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE;
}
4148+
EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
4149+
41364150
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
41374151
int __user *optlen)
41384152
{

net/ipv4/tcp_ipv4.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2793,6 +2793,7 @@ struct proto tcp_prot = {
27932793
.shutdown = tcp_shutdown,
27942794
.setsockopt = tcp_setsockopt,
27952795
.getsockopt = tcp_getsockopt,
2796+
.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
27962797
.keepalive = tcp_set_keepalive,
27972798
.recvmsg = tcp_recvmsg,
27982799
.sendmsg = tcp_sendmsg,

net/ipv6/tcp_ipv6.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = {
21212121
.shutdown = tcp_shutdown,
21222122
.setsockopt = tcp_setsockopt,
21232123
.getsockopt = tcp_getsockopt,
2124+
.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
21242125
.keepalive = tcp_set_keepalive,
21252126
.recvmsg = tcp_recvmsg,
21262127
.sendmsg = tcp_sendmsg,

net/socket.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
21262126
return __sys_setsockopt(fd, level, optname, optval, optlen);
21272127
}
21282128

2129+
INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
2130+
int optname));
2131+
21292132
/*
21302133
* Get a socket option. Because we don't know the option lengths we have
21312134
* to pass a user mode parameter for the protocols to sort out.

0 commit comments

Comments
 (0)