
Commit ea4ca58

Paolo Abeni authored and kuba-moo committed
mptcp: refine MPTCP-level ack scheduling
Sending a timely MPTCP-level ack is somewhat difficult when the insertion into the msk receive queue is performed by the worker: it needs a TCP-level dup-ack to notify the MPTCP-level ack_seq increase, as both the TCP-level ack seq and the rcv window are unchanged.

We can actually avoid processing incoming data with the worker, and let the subflow or recvmsg() send acks as needed.

When recvmsg() moves the skbs inside the msk receive queue, the msk space is still unchanged, so tcp_cleanup_rbuf() could end up skipping TCP-level ack generation. However, when __mptcp_move_skbs() is invoked, a known amount of bytes is going to be consumed soon: we update the rcv wnd computation taking them into account.

Additionally, we need to explicitly trigger tcp_cleanup_rbuf() when recvmsg() consumes a significant amount of the receive buffer.

Signed-off-by: Paolo Abeni <[email protected]>
Signed-off-by: Mat Martineau <[email protected]>
Signed-off-by: Jakub Kicinski <[email protected]>
1 parent fa3fe2b commit ea4ca58
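
Editor's note: the core of the new scheme is the check added to mptcp_recvmsg(). msk->old_wspace snapshots the free receive space each time a DSS ack is built (see the options.c hunk below), and recvmsg() later sends a non-forced MPTCP-level ack once the current free space has at least doubled relative to that snapshot. The following is a minimal userspace sketch of just that trigger, not the kernel code: the struct msk_model type, its fields other than old_wspace, the buffer figures, and model_space() (a stand-in for tcp_space()) are all invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the receive-side state touched by this patch.
 * 'old_wspace' snapshots the free receive space at the time the last
 * MPTCP-level ack was generated.
 */
struct msk_model {
	int rcvbuf;	/* total receive buffer size */
	int rmem;	/* bytes queued, not yet read by the application */
	int old_wspace;	/* free space advertised with the last ack */
};

static int model_space(const struct msk_model *msk)
{
	return msk->rcvbuf - msk->rmem;	/* stand-in for tcp_space() */
}

/* Mirrors the check added to mptcp_recvmsg(): ack only when the free
 * space has at least doubled since the last advertised window.
 */
static bool should_send_ack(const struct msk_model *msk)
{
	return (model_space(msk) - msk->old_wspace) >= msk->old_wspace;
}

int main(void)
{
	struct msk_model msk = { .rcvbuf = 65536, .rmem = 49152,
				 .old_wspace = 16384 };

	msk.rmem -= 8192;	/* recvmsg() consumed 8 KiB: not enough */
	printf("after 8K read:  ack? %d\n", should_send_ack(&msk));

	msk.rmem -= 16384;	/* another 16 KiB: free space has doubled */
	printf("after 24K read: ack? %d\n", should_send_ack(&msk));
	return 0;
}

Compiled as plain C, this prints "ack? 0" after the first read and "ack? 1" once the free space has doubled, which is the point at which the patch considers a window update worth advertising.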

File tree

4 files changed (+61, -57 lines)

net/mptcp/options.c

Lines changed: 1 addition & 0 deletions
@@ -530,6 +530,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 		opts->ext_copy.ack64 = 0;
 	}
 	opts->ext_copy.use_ack = 1;
+	WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
 
 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
 	if (dss_size == 0)

net/mptcp/protocol.c

Lines changed: 51 additions & 54 deletions
@@ -407,16 +407,42 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
 	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
 }
 
-static void mptcp_send_ack(struct mptcp_sock *msk)
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+	if (subflow->request_join && !subflow->fully_established)
+		return false;
+
+	/* only send if our side has not closed yet */
+	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
 {
 	struct mptcp_subflow_context *subflow;
+	struct sock *pick = NULL;
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
-		lock_sock(ssk);
-		tcp_send_ack(ssk);
-		release_sock(ssk);
+		if (force) {
+			lock_sock(ssk);
+			tcp_send_ack(ssk);
+			release_sock(ssk);
+			continue;
+		}
+
+		/* if the hinted ssk is still active, use it */
+		pick = ssk;
+		if (ssk == msk->ack_hint)
+			break;
+	}
+	if (!force && pick) {
+		lock_sock(pick);
+		tcp_cleanup_rbuf(pick, 1);
+		release_sock(pick);
 	}
 }
 
@@ -468,7 +494,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
 
 		ret = true;
 		mptcp_set_timeout(sk, NULL);
-		mptcp_send_ack(msk);
+		mptcp_send_ack(msk, true);
 		mptcp_close_wake_up(sk);
 	}
 	return ret;
@@ -483,7 +509,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 	unsigned int moved = 0;
 	bool more_data_avail;
 	struct tcp_sock *tp;
-	u32 old_copied_seq;
 	bool done = false;
 	int sk_rbuf;
 
@@ -500,7 +525,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
 	pr_debug("msk=%p ssk=%p", msk, ssk);
 	tp = tcp_sk(ssk);
-	old_copied_seq = tp->copied_seq;
 	do {
 		u32 map_remaining, offset;
 		u32 seq = tp->copied_seq;
@@ -564,11 +588,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 			break;
 		}
 	} while (more_data_avail);
+	msk->ack_hint = ssk;
 
 	*bytes += moved;
-	if (tp->copied_seq != old_copied_seq)
-		tcp_cleanup_rbuf(ssk, 1);
-
 	return done;
 }
 
@@ -672,19 +694,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
 	if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
 		goto wake;
 
-	if (move_skbs_to_msk(msk, ssk))
-		goto wake;
+	move_skbs_to_msk(msk, ssk);
 
-	/* mptcp socket is owned, release_cb should retry */
-	if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
-			      &sk->sk_tsq_flags)) {
-		sock_hold(sk);
-
-		/* need to try again, its possible release_cb() has already
-		 * been called after the test_and_set_bit() above.
-		 */
-		move_skbs_to_msk(msk, ssk);
-	}
 wake:
 	if (wake)
 		sk->sk_data_ready(sk);
@@ -1095,18 +1106,6 @@ static void mptcp_nospace(struct mptcp_sock *msk)
 	mptcp_clean_una((struct sock *)msk);
 }
 
-static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
-{
-	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
-	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
-	if (subflow->request_join && !subflow->fully_established)
-		return false;
-
-	/* only send if our side has not closed yet */
-	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
-}
-
 #define MPTCP_SEND_BURST_SIZE	((1 << 16) - \
 				 sizeof(struct tcphdr) - \
 				 MAX_TCP_OPTION_SPACE - \
@@ -1534,7 +1533,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 	msk->rcvq_space.time = mstamp;
 }
 
-static bool __mptcp_move_skbs(struct mptcp_sock *msk)
+static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
 {
 	unsigned int moved = 0;
 	bool done;
@@ -1553,12 +1552,16 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
 
 		slowpath = lock_sock_fast(ssk);
 		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+		if (moved && rcv) {
+			WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
+			tcp_cleanup_rbuf(ssk, 1);
+			WRITE_ONCE(msk->rmem_pending, 0);
+		}
 		unlock_sock_fast(ssk, slowpath);
 	} while (!done);
 
 	if (mptcp_ofo_queue(msk) || moved > 0) {
-		if (!mptcp_check_data_fin((struct sock *)msk))
-			mptcp_send_ack(msk);
+		mptcp_check_data_fin((struct sock *)msk);
 		return true;
 	}
 	return false;
@@ -1582,8 +1585,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 	__mptcp_flush_join_list(msk);
 
-	while (len > (size_t)copied) {
-		int bytes_read;
+	for (;;) {
+		int bytes_read, old_space;
 
 		bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
 		if (unlikely(bytes_read < 0)) {
@@ -1595,9 +1598,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		copied += bytes_read;
 
 		if (skb_queue_empty(&sk->sk_receive_queue) &&
-		    __mptcp_move_skbs(msk))
+		    __mptcp_move_skbs(msk, len - copied))
 			continue;
 
+		/* be sure to advertise window change */
+		old_space = READ_ONCE(msk->old_wspace);
+		if ((tcp_space(sk) - old_space) >= old_space)
+			mptcp_send_ack(msk, false);
+
 		/* only the master socket status is relevant here. The exit
 		 * conditions mirror closely tcp_recvmsg()
 		 */
@@ -1650,7 +1658,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		/* .. race-breaker: ssk might have gotten new data
 		 * after last __mptcp_move_skbs() returned false.
 		 */
-		if (unlikely(__mptcp_move_skbs(msk)))
+		if (unlikely(__mptcp_move_skbs(msk, 0)))
 			set_bit(MPTCP_DATA_READY, &msk->flags);
 	} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
 		/* data to read but mptcp_wait_data() cleared DATA_READY */
@@ -1881,7 +1889,6 @@ static void mptcp_worker(struct work_struct *work)
 	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
 		__mptcp_close_subflow(msk);
 
-	__mptcp_move_skbs(msk);
 	if (mptcp_send_head(sk))
 		mptcp_push_pending(sk, 0);
 
@@ -1965,6 +1972,7 @@ static int __mptcp_init_sock(struct sock *sk)
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
 
+	msk->ack_hint = NULL;
 	msk->first = NULL;
 	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
 
@@ -2500,8 +2508,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
 	return -EOPNOTSUPP;
 }
 
-#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
-			    TCPF_WRITE_TIMER_DEFERRED)
+#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
 
 /* this is very alike tcp_release_cb() but we must handle differently a
  * different set of events
@@ -2519,16 +2526,6 @@ static void mptcp_release_cb(struct sock *sk)
 
 	sock_release_ownership(sk);
 
-	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
-		struct mptcp_sock *msk = mptcp_sk(sk);
-		struct sock *ssk;
-
-		ssk = mptcp_subflow_recv_lookup(msk);
-		if (!ssk || sk->sk_state == TCP_CLOSE ||
-		    !schedule_work(&msk->work))
-			__sock_put(sk);
-	}
-
 	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		mptcp_retransmit_handler(sk);
 		__sock_put(sk);
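
Editor's note on the non-forced path of the reworked mptcp_send_ack() above: it walks the subflows, remembers the last one it visited as a fallback, stops early when it reaches msk->ack_hint (the subflow that most recently moved data into the msk queue), and then lets the chosen subflow emit the window update via tcp_cleanup_rbuf() rather than an unconditional dup-ack. The following is a standalone sketch of just that selection logic; the struct subflow type and the array-based walk (a stand-in for mptcp_for_each_subflow()) are hypothetical.

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for the kernel's subflow context. */
struct subflow { int id; };

/* Mirrors the pick loop: keep the last subflow walked as a fallback,
 * but stop early if the hinted subflow is still in the list.
 */
static struct subflow *pick_ack_subflow(struct subflow **flows, size_t n,
					struct subflow *ack_hint)
{
	struct subflow *pick = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		pick = flows[i];
		if (flows[i] == ack_hint)
			break;	/* hinted subflow found: use it */
	}
	return pick;		/* hint, fallback, or NULL if no subflows */
}

int main(void)
{
	struct subflow a = { 1 }, b = { 2 };
	struct subflow *flows[] = { &a, &b };
	struct subflow *pick = pick_ack_subflow(flows, 2, &b);

	printf("picked subflow %d\n", pick ? pick->id : -1);
	return 0;
}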

net/mptcp/protocol.h

Lines changed: 8 additions & 0 deletions
@@ -220,17 +220,20 @@ struct mptcp_sock {
 	u64		rcv_data_fin_seq;
 	struct sock	*last_snd;
 	int		snd_burst;
+	int		old_wspace;
 	atomic64_t	snd_una;
 	atomic64_t	wnd_end;
 	unsigned long	timer_ival;
 	u32		token;
+	int		rmem_pending;
 	unsigned long	flags;
 	bool		can_ack;
 	bool		fully_established;
 	bool		rcv_data_fin;
 	bool		snd_data_fin_enable;
 	bool		use_64bit_ack; /* Set when we received a 64-bit DSN */
 	spinlock_t	join_list_lock;
+	struct sock	*ack_hint;
 	struct work_struct work;
 	struct sk_buff	*ooo_last_skb;
 	struct rb_root	out_of_order_queue;
@@ -258,6 +261,11 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
 	return (struct mptcp_sock *)sk;
 }
 
+static inline int __mptcp_space(const struct sock *sk)
+{
+	return tcp_space(sk) + READ_ONCE(mptcp_sk(sk)->rmem_pending);
+}
+
 static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
 {
 	const struct mptcp_sock *msk = mptcp_sk(sk);
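
Editor's note: the new __mptcp_space() helper above is what makes the commit message's "a known amount of bytes is going to be consumed soon" concrete. While __mptcp_move_skbs() runs on behalf of recvmsg(), msk->rmem_pending temporarily credits up to min(rcv, moved) bytes to the free-space computation, so tcp_cleanup_rbuf() sees the window that will exist once recvmsg() finishes. A self-contained sketch of that bookkeeping follows; the names free_space, model_*, the MIN() macro, and the example figures are invented here, while the formula and the set/clear pattern around tcp_cleanup_rbuf() follow the patch.

#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

static int free_space = 4096;	/* stand-in for tcp_space(sk) */
static int rmem_pending;	/* bytes recvmsg() will consume shortly */

/* Mirrors __mptcp_space(): free space plus the soon-consumed bytes. */
static int model_mptcp_space(void)
{
	return free_space + rmem_pending;
}

/* tcp_cleanup_rbuf() decides whether a window update is worth sending;
 * here we only show the space it would observe.
 */
static void model_cleanup_rbuf(void)
{
	printf("space seen by rbuf cleanup: %d\n", model_mptcp_space());
}

int main(void)
{
	unsigned int rcv = 16384;	/* bytes recvmsg() still wants */
	unsigned int moved = 8192;	/* bytes just moved to the msk queue */

	/* As in __mptcp_move_skbs(): credit the pending bytes while the
	 * subflow re-evaluates ack/window generation, then clear them.
	 */
	rmem_pending = MIN(rcv, moved);
	model_cleanup_rbuf();		/* prints 4096 + 8192 = 12288 */
	rmem_pending = 0;
	return 0;
}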

net/mptcp/subflow.c

Lines changed: 1 addition & 3 deletions
@@ -850,8 +850,6 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
 	sk_eat_skb(ssk, skb);
 	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
 		subflow->map_valid = 0;
-	if (incr)
-		tcp_cleanup_rbuf(ssk, incr);
 }
 
 static bool subflow_check_data_avail(struct sock *ssk)
@@ -973,7 +971,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
 	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
 	const struct sock *sk = subflow->conn;
 
-	*space = tcp_space(sk);
+	*space = __mptcp_space(sk);
 	*full_space = tcp_full_space(sk);
 }
 
