Skip to content

Commit c58b155

Browse files
author
Alexei Starovoitov
committed
Merge branch 'bpf_sk_assign'
Joe Stringer says: ==================== Introduce a new helper that allows assigning a previously-found socket to the skb as the packet is received towards the stack, to cause the stack to guide the packet towards that socket subject to local routing configuration. The intention is to support TProxy use cases more directly from eBPF programs attached at TC ingress, to simplify and streamline Linux stack configuration in scale environments with Cilium. Normally in ip{,6}_rcv_core(), the skb will be orphaned, dropping any existing socket reference associated with the skb. Existing tproxy implementations in netfilter get around this restriction by running the tproxy logic after ip_rcv_core() in the PREROUTING table. However, this is not an option for TC-based logic (including eBPF programs attached at TC ingress). This series introduces the BPF helper bpf_sk_assign() to associate the socket with the skb on the ingress path as the packet is passed up the stack. The initial patch in the series simply takes a reference on the socket to ensure safety, but later patches relax this for listen sockets. To ensure delivery to the relevant socket, we still consult the routing table; for full examples of how to configure this, see the tests in patch #5. The simplest form of the route would look like this: $ ip route add local default dev lo This series is laid out as follows: * Patch 1 extends the eBPF API to add sk_assign() and defines a new socket free function to allow the later paths to understand when the socket associated with the skb should be kept through receive. * Patches 2-3 optimize the receive path to avoid taking a reference on listener sockets during receive. * Patches 4-5 extend the selftests with examples of the new functionality and validation of correct behaviour. 
Changes since v4: * Fix build with CONFIG_INET disabled * Rebase Changes since v3: * Use sock_gen_put() directly instead of sock_edemux() from sock_pfree() * Commit message wording fixups * Add acks from Martin, Lorenz * Rebase Changes since v2: * Add selftests for UDP socket redirection * Drop the early demux optimization patch (defer for more testing) * Fix check for orphaning after TC act return * Tidy up the tests to clean up properly and be less noisy. Changes since v1: * Replace the metadata_dst approach with using the skb->destructor to determine whether the socket has been prefetched. This is much simpler. * Avoid taking a reference on listener sockets during receive * Restrict assigning sockets across namespaces * Restrict assigning SO_REUSEPORT sockets * Fix cookie usage for socket dst check * Rebase the tests against test_progs infrastructure * Tidy up commit messages ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
2 parents b49e42a + 8a02a17 commit c58b155

File tree

14 files changed

+662
-24
lines changed

14 files changed

+662
-24
lines changed

include/net/inet6_hashtables.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,8 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
8585
int iif, int sdif,
8686
bool *refcounted)
8787
{
88-
struct sock *sk = skb_steal_sock(skb);
88+
struct sock *sk = skb_steal_sock(skb, refcounted);
8989

90-
*refcounted = true;
9190
if (sk)
9291
return sk;
9392

include/net/inet_hashtables.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,10 +379,9 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
379379
const int sdif,
380380
bool *refcounted)
381381
{
382-
struct sock *sk = skb_steal_sock(skb);
382+
struct sock *sk = skb_steal_sock(skb, refcounted);
383383
const struct iphdr *iph = ip_hdr(skb);
384384

385-
*refcounted = true;
386385
if (sk)
387386
return sk;
388387

include/net/sock.h

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1659,6 +1659,7 @@ void sock_rfree(struct sk_buff *skb);
16591659
void sock_efree(struct sk_buff *skb);
16601660
#ifdef CONFIG_INET
16611661
void sock_edemux(struct sk_buff *skb);
1662+
void sock_pfree(struct sk_buff *skb);
16621663
#else
16631664
#define sock_edemux sock_efree
16641665
#endif
@@ -2526,16 +2527,14 @@ void sock_net_set(struct sock *sk, struct net *net)
25262527
write_pnet(&sk->sk_net, net);
25272528
}
25282529

2529-
static inline struct sock *skb_steal_sock(struct sk_buff *skb)
2530+
static inline bool
2531+
skb_sk_is_prefetched(struct sk_buff *skb)
25302532
{
2531-
if (skb->sk) {
2532-
struct sock *sk = skb->sk;
2533-
2534-
skb->destructor = NULL;
2535-
skb->sk = NULL;
2536-
return sk;
2537-
}
2538-
return NULL;
2533+
#ifdef CONFIG_INET
2534+
return skb->destructor == sock_pfree;
2535+
#else
2536+
return false;
2537+
#endif /* CONFIG_INET */
25392538
}
25402539

25412540
/* This helper checks if a socket is a full socket,
@@ -2546,6 +2545,35 @@ static inline bool sk_fullsock(const struct sock *sk)
25462545
return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
25472546
}
25482547

2548+
static inline bool
2549+
sk_is_refcounted(struct sock *sk)
2550+
{
2551+
/* Only full sockets have sk->sk_flags. */
2552+
return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
2553+
}
2554+
2555+
/**
2556+
* skb_steal_sock
2557+
* @skb to steal the socket from
2558+
* @refcounted is set to true if the socket is reference-counted
2559+
*/
2560+
static inline struct sock *
2561+
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
2562+
{
2563+
if (skb->sk) {
2564+
struct sock *sk = skb->sk;
2565+
2566+
*refcounted = true;
2567+
if (skb_sk_is_prefetched(skb))
2568+
*refcounted = sk_is_refcounted(sk);
2569+
skb->destructor = NULL;
2570+
skb->sk = NULL;
2571+
return sk;
2572+
}
2573+
*refcounted = false;
2574+
return NULL;
2575+
}
2576+
25492577
/* Checks if this SKB belongs to an HW offloaded socket
25502578
* and whether any SW fallbacks are required based on dev.
25512579
* Check decrypted mark in case skb_orphan() cleared socket.

include/uapi/linux/bpf.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2983,6 +2983,28 @@ union bpf_attr {
29832983
* **bpf_get_current_cgroup_id**\ ().
29842984
* Return
29852985
* The id is returned or 0 in case the id could not be retrieved.
2986+
*
2987+
* int bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
2988+
* Description
2989+
* Assign the *sk* to the *skb*. When combined with appropriate
2990+
* routing configuration to receive the packet towards the socket,
2991+
* will cause *skb* to be delivered to the specified socket.
2992+
* Subsequent redirection of *skb* via **bpf_redirect**\ (),
2993+
* **bpf_clone_redirect**\ () or other methods outside of BPF may
2994+
* interfere with successful delivery to the socket.
2995+
*
2996+
* This operation is only valid from TC ingress path.
2997+
*
2998+
* The *flags* argument must be zero.
2999+
* Return
3000+
* 0 on success, or a negative errno in case of failure.
3001+
*
3002+
* * **-EINVAL** Unsupported flags specified.
3003+
* * **-ENOENT** Socket is unavailable for assignment.
3004+
* * **-ENETUNREACH** Socket is unreachable (wrong netns).
3005+
* * **-EOPNOTSUPP** Unsupported operation, for example a
3006+
* call from outside of TC ingress.
3007+
* * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
29863008
*/
29873009
#define __BPF_FUNC_MAPPER(FN) \
29883010
FN(unspec), \
@@ -3108,7 +3130,8 @@ union bpf_attr {
31083130
FN(get_ns_current_pid_tgid), \
31093131
FN(xdp_output), \
31103132
FN(get_netns_cookie), \
3111-
FN(get_current_ancestor_cgroup_id),
3133+
FN(get_current_ancestor_cgroup_id), \
3134+
FN(sk_assign),
31123135

31133136
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
31143137
* function eBPF program intends to call

net/core/filter.c

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5401,8 +5401,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
54015401

54025402
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
54035403
{
5404-
/* Only full sockets have sk->sk_flags. */
5405-
if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
5404+
if (sk_is_refcounted(sk))
54065405
sock_gen_put(sk);
54075406
return 0;
54085407
}
@@ -5918,6 +5917,36 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
59185917
.arg5_type = ARG_CONST_SIZE,
59195918
};
59205919

5920+
BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
5921+
{
5922+
if (flags != 0)
5923+
return -EINVAL;
5924+
if (!skb_at_tc_ingress(skb))
5925+
return -EOPNOTSUPP;
5926+
if (unlikely(dev_net(skb->dev) != sock_net(sk)))
5927+
return -ENETUNREACH;
5928+
if (unlikely(sk->sk_reuseport))
5929+
return -ESOCKTNOSUPPORT;
5930+
if (sk_is_refcounted(sk) &&
5931+
unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
5932+
return -ENOENT;
5933+
5934+
skb_orphan(skb);
5935+
skb->sk = sk;
5936+
skb->destructor = sock_pfree;
5937+
5938+
return 0;
5939+
}
5940+
5941+
static const struct bpf_func_proto bpf_sk_assign_proto = {
5942+
.func = bpf_sk_assign,
5943+
.gpl_only = false,
5944+
.ret_type = RET_INTEGER,
5945+
.arg1_type = ARG_PTR_TO_CTX,
5946+
.arg2_type = ARG_PTR_TO_SOCK_COMMON,
5947+
.arg3_type = ARG_ANYTHING,
5948+
};
5949+
59215950
#endif /* CONFIG_INET */
59225951

59235952
bool bpf_helper_changes_pkt_data(void *func)
@@ -6249,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
62496278
return &bpf_skb_ecn_set_ce_proto;
62506279
case BPF_FUNC_tcp_gen_syncookie:
62516280
return &bpf_tcp_gen_syncookie_proto;
6281+
case BPF_FUNC_sk_assign:
6282+
return &bpf_sk_assign_proto;
62526283
#endif
62536284
default:
62546285
return bpf_base_func_proto(func_id);

net/core/sock.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2071,6 +2071,18 @@ void sock_efree(struct sk_buff *skb)
20712071
}
20722072
EXPORT_SYMBOL(sock_efree);
20732073

2074+
/* Buffer destructor for prefetch/receive path where reference count may
2075+
* not be held, e.g. for listen sockets.
2076+
*/
2077+
#ifdef CONFIG_INET
2078+
void sock_pfree(struct sk_buff *skb)
2079+
{
2080+
if (sk_is_refcounted(skb->sk))
2081+
sock_gen_put(skb->sk);
2082+
}
2083+
EXPORT_SYMBOL(sock_pfree);
2084+
#endif /* CONFIG_INET */
2085+
20742086
kuid_t sock_i_uid(struct sock *sk)
20752087
{
20762088
kuid_t uid;

net/ipv4/ip_input.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
509509
IPCB(skb)->iif = skb->skb_iif;
510510

511511
/* Must drop socket now because of tproxy. */
512-
skb_orphan(skb);
512+
if (!skb_sk_is_prefetched(skb))
513+
skb_orphan(skb);
513514

514515
return skb;
515516

net/ipv4/udp.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2288,6 +2288,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
22882288
struct rtable *rt = skb_rtable(skb);
22892289
__be32 saddr, daddr;
22902290
struct net *net = dev_net(skb->dev);
2291+
bool refcounted;
22912292

22922293
/*
22932294
* Validate the packet.
@@ -2313,7 +2314,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
23132314
if (udp4_csum_init(skb, uh, proto))
23142315
goto csum_error;
23152316

2316-
sk = skb_steal_sock(skb);
2317+
sk = skb_steal_sock(skb, &refcounted);
23172318
if (sk) {
23182319
struct dst_entry *dst = skb_dst(skb);
23192320
int ret;
@@ -2322,7 +2323,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
23222323
udp_sk_rx_dst_set(sk, dst);
23232324

23242325
ret = udp_unicast_rcv_skb(sk, skb, uh);
2325-
sock_put(sk);
2326+
if (refcounted)
2327+
sock_put(sk);
23262328
return ret;
23272329
}
23282330

net/ipv6/ip6_input.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
285285
rcu_read_unlock();
286286

287287
/* Must drop socket now because of tproxy. */
288-
skb_orphan(skb);
288+
if (!skb_sk_is_prefetched(skb))
289+
skb_orphan(skb);
289290

290291
return skb;
291292
err:

net/ipv6/udp.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
843843
struct net *net = dev_net(skb->dev);
844844
struct udphdr *uh;
845845
struct sock *sk;
846+
bool refcounted;
846847
u32 ulen = 0;
847848

848849
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -879,7 +880,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
879880
goto csum_error;
880881

881882
/* Check if the socket is already available, e.g. due to early demux */
882-
sk = skb_steal_sock(skb);
883+
sk = skb_steal_sock(skb, &refcounted);
883884
if (sk) {
884885
struct dst_entry *dst = skb_dst(skb);
885886
int ret;
@@ -888,12 +889,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
888889
udp6_sk_rx_dst_set(sk, dst);
889890

890891
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
891-
sock_put(sk);
892+
if (refcounted)
893+
sock_put(sk);
892894
goto report_csum_error;
893895
}
894896

895897
ret = udp6_unicast_rcv_skb(sk, skb, uh);
896-
sock_put(sk);
898+
if (refcounted)
899+
sock_put(sk);
897900
return ret;
898901
}
899902

0 commit comments

Comments
 (0)