Skip to content

Commit e9ad5fd

Browse files
mrpre authored and Kernel Patches Daemon committed
bpf, sockmap: Fix FIONREAD for sockmap
A socket using sockmap has its own independent receive queue: ingress_msg. This queue may contain data from its own protocol stack or from other sockets. Therefore, for sockmap, relying solely on copied_seq and rcv_nxt to calculate FIONREAD is not enough. This patch adds a new ingress_size field in the psock structure to record the data length in ingress_msg. Additionally, we implement new ioctl interfaces for TCP and UDP to intercept FIONREAD operations. While Unix and VSOCK also support sockmap and have similar FIONREAD calculation issues, fixing them would require more extensive changes (please let me know if modifications are needed). I believe it's not appropriate to include those changes under this fix patch. Previous work by John Fastabend made some efforts towards FIONREAD support: commit e5c6de5 ("bpf, sockmap: Incorrectly handling copied_seq") Although the current patch is based on the previous work by John Fastabend, it is acceptable for our Fixes tag to point to the same commit. FD1:read() -- FD1->copied_seq++ | [read data] | [enqueue data] v [sockmap] -> ingress to self -> ingress_msg queue FD1 native stack ------> ^ -- FD1->rcv_nxt++ -> redirect to other | [enqueue data] | | | ingress to FD1 v ^ ... | [sockmap] FD2 native stack Fixes: 04919be ("tcp: Introduce tcp_read_skb()") Signed-off-by: Jiayuan Chen <[email protected]>
1 parent 90361ff commit e9ad5fd

File tree

4 files changed

+88
-5
lines changed

4 files changed

+88
-5
lines changed

include/linux/skmsg.h

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ struct sk_psock {
108108
struct sk_buff_head ingress_skb;
109109
struct list_head ingress_msg;
110110
spinlock_t ingress_lock;
111+
ssize_t ingress_size;
111112
unsigned long state;
112113
struct list_head link;
113114
spinlock_t link_lock;
@@ -342,6 +343,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
342343
kfree_skb(skb);
343344
}
344345

/* Return the total number of bytes currently queued on the psock's
 * ingress_msg list (maintained by queue/dequeue and recvmsg paths).
 * NOTE(review): reads ingress_size without taking ingress_lock, so the
 * value is a racy snapshot — appears acceptable for FIONREAD reporting;
 * confirm no caller needs a stable count.
 */
static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
{
	return psock->ingress_size;
}
350+
/* Adjust the cached ingress_msg byte count by @diff (pass a negative
 * value to decrement).  The queue/dequeue callers in this patch invoke
 * it under ingress_lock; __sk_msg_recvmsg decrements it as data is
 * copied out to userspace.
 */
static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
{
	psock->ingress_size += diff;
}
355+
345356
static inline bool sk_psock_queue_msg(struct sk_psock *psock,
346357
struct sk_msg *msg)
347358
{
@@ -350,6 +361,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
350361
spin_lock_bh(&psock->ingress_lock);
351362
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
352363
list_add_tail(&msg->list, &psock->ingress_msg);
364+
sk_psock_inc_msg_size(psock, msg->sg.size);
353365
ret = true;
354366
} else {
355367
sk_msg_free(psock->sk, msg);
@@ -366,8 +378,10 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
366378

367379
spin_lock_bh(&psock->ingress_lock);
368380
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
369-
if (msg)
381+
if (msg) {
370382
list_del(&msg->list);
383+
sk_psock_inc_msg_size(psock, -msg->sg.size);
384+
}
371385
spin_unlock_bh(&psock->ingress_lock);
372386
return msg;
373387
}
@@ -544,6 +558,36 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
544558
return !!psock->saved_data_ready;
545559
}
546560

561+
/* Number of bytes available to read from the socket's psock
 * ingress_msg queue, or 0 when no psock is attached (e.g. the socket
 * was removed from the sockmap).  Used by FIONREAD handlers.
 */
static inline ssize_t sk_psock_msg_inq(struct sock *sk)
{
	struct sk_psock *psock;
	ssize_t inq;

	psock = sk_psock_get(sk);
	if (unlikely(!psock))
		return 0;

	inq = sk_psock_get_msg_size(psock);
	sk_psock_put(sk, psock);
	return inq;
}
573+
574+
/* for udp */
575+
static inline ssize_t sk_msg_first_length(struct sock *sk)
576+
{
577+
struct sk_psock *psock;
578+
struct sk_msg *msg;
579+
ssize_t inq = 0;
580+
581+
psock = sk_psock_get(sk);
582+
if (likely(psock)) {
583+
msg = sk_psock_peek_msg(psock);
584+
if (msg)
585+
inq = msg->sg.size;
586+
sk_psock_put(sk, psock);
587+
}
588+
return inq;
589+
}
590+
547591
#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
548592

549593
#define BPF_F_STRPARSER (1UL << 1)

net/core/skmsg.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg
455455
atomic_sub(copy, &sk->sk_rmem_alloc);
456456
}
457457
msg_rx->sg.size -= copy;
458+
sk_psock_inc_msg_size(psock, -copy);
458459

459460
if (!sge->length) {
460461
sk_msg_iter_var_next(i);

net/ipv4/tcp_bpf.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <net/inet_common.h>
1212
#include <net/tls.h>
13+
#include <asm/ioctls.h>
1314

1415
void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1516
{
@@ -332,6 +333,25 @@ static int tcp_bpf_recvmsg_parser(struct sock *sk,
332333
return copied;
333334
}
334335

336+
/* tcp_bpf_ioctl - ioctl handler for TCP sockets attached to a sockmap.
 *
 * Data received by a psock is queued on psock->ingress_msg rather than
 * the regular TCP receive queue, so the stock tcp_ioctl() FIONREAD
 * computation misses it.  Intercept SIOCINQ (FIONREAD) and report the
 * ingress_msg byte count via sk_psock_msg_inq(); every other command
 * is forwarded unchanged to tcp_ioctl().
 */
static int tcp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
{
	bool slow;

	/* we only care about FIONREAD */
	if (cmd != SIOCINQ)
		return tcp_ioctl(sk, cmd, karg);

	/* works similar as tcp_ioctl */
	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	slow = lock_sock_fast(sk);
	*karg = sk_psock_msg_inq(sk);
	unlock_sock_fast(sk, slow);

	return 0;
}
354+
335355
static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
336356
int flags, int *addr_len)
337357
{
@@ -610,6 +630,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
610630
prot[TCP_BPF_BASE].close = sock_map_close;
611631
prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
612632
prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
633+
prot[TCP_BPF_BASE].ioctl = tcp_bpf_ioctl;
613634

614635
prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
615636
prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;

net/ipv4/udp_bpf.c

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <net/sock.h>
66
#include <net/udp.h>
77
#include <net/inet_common.h>
8+
#include <asm/ioctls.h>
89

910
#include "udp_impl.h"
1011

@@ -111,12 +112,28 @@ enum {
111112
static DEFINE_SPINLOCK(udpv6_prot_lock);
112113
static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
113114

115+
static int udp_bpf_ioctl(struct sock *sk, int cmd, int *karg)
116+
{
117+
/* we only care about FIONREAD */
118+
if (cmd != SIOCINQ)
119+
return tcp_ioctl(sk, cmd, karg);
120+
121+
/* works similar as udp_ioctl.
122+
* man udp(7): "FIONREAD (SIOCINQ): Returns the size of the next
123+
* pending datagram in the integer in bytes, or 0 when no datagram
124+
* is pending."
125+
*/
126+
*karg = sk_msg_first_length(sk);
127+
return 0;
128+
}
129+
114130
/* Build the sockmap UDP proto ops: start from a copy of @base and
 * override the hooks sockmap needs — close, recvmsg, readable-check,
 * and the FIONREAD-aware ioctl added by this patch.
 */
static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
{
	*prot = *base;
	prot->close = sock_map_close;
	prot->recvmsg = udp_bpf_recvmsg;
	prot->sock_is_readable = sk_msg_is_readable;
	prot->ioctl = udp_bpf_ioctl;
}
121138

122139
static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)

0 commit comments

Comments
 (0)