Skip to content

Commit 2c47a65

Browse files
committed
Merge branch 'tcp-implement-SACK-compression'
Eric Dumazet says: ==================== tcp: implement SACK compression When TCP receives an out-of-order packet, it immediately sends a SACK packet, generating network load but also forcing the receiver to send 1-MSS pathological packets, increasing its RTX queue length/depth, and thus processing time. Wifi networks suffer from this aggressive behavior, but generally speaking, all these SACK packets add fuel to the fire when networks are under congestion. This patch series adds SACK compression, but the infrastructure could be leveraged to also compress ACK in the future. v2: Addressed Neal feedback. Added two sysctls to allow fine tuning, or even disabling the feature. v3: take rtt = min(srtt, rcv_rtt) as Yuchung suggested, because rcv_rtt can be over estimated for RPC (or sender limited) ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 64a2658 + 9c21d2f commit 2c47a65

File tree

12 files changed

+107
-9
lines changed

12 files changed

+107
-9
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,19 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
525525
tcp_sack - BOOLEAN
526526
Enable select acknowledgments (SACKS).
527527

528+
tcp_comp_sack_delay_ns - LONG INTEGER
529+
TCP tries to reduce number of SACK sent, using a timer
530+
based on 5% of SRTT, capped by this sysctl, in nanoseconds.
531+
The default is 1ms, based on TSO autosizing period.
532+
533+
Default : 1,000,000 ns (1 ms)
534+
535+
tcp_comp_sack_nr - INTEGER
536+
Max number of SACKs that can be compressed.
537+
Using 0 disables SACK compression.
538+
539+
Default : 44
540+
528541
tcp_slow_start_after_idle - BOOLEAN
529542
If set, provide RFC2861 behavior and time out the congestion
530543
window after an idle period. An idle period is defined at

include/linux/tcp.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ struct tcp_sock {
218218
reord:1; /* reordering detected */
219219
} rack;
220220
u16 advmss; /* Advertised MSS */
221+
u8 compressed_ack;
221222
u32 chrono_start; /* Start time in jiffies of a TCP chrono */
222223
u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */
223224
u8 chrono_type:2, /* current chronograph type */
@@ -297,6 +298,7 @@ struct tcp_sock {
297298
u32 sacked_out; /* SACK'd packets */
298299

299300
struct hrtimer pacing_timer;
301+
struct hrtimer compressed_ack_timer;
300302

301303
/* from STCP, retrans queue hinting */
302304
struct sk_buff* lost_skb_hint;

include/net/netns/ipv4.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ struct netns_ipv4 {
160160
int sysctl_tcp_pacing_ca_ratio;
161161
int sysctl_tcp_wmem[3];
162162
int sysctl_tcp_rmem[3];
163+
int sysctl_tcp_comp_sack_nr;
164+
unsigned long sysctl_tcp_comp_sack_delay_ns;
163165
struct inet_timewait_death_row tcp_death_row;
164166
int sysctl_max_syn_backlog;
165167
int sysctl_tcp_fastopen;

include/net/tcp.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,10 @@ void tcp_init_xmit_timers(struct sock *);
559559
static inline void tcp_clear_xmit_timers(struct sock *sk)
560560
{
561561
if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
562-
sock_put(sk);
562+
__sock_put(sk);
563+
564+
if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
565+
__sock_put(sk);
563566

564567
inet_csk_clear_xmit_timers(sk);
565568
}

include/uapi/linux/snmp.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ enum
278278
LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */
279279
LINUX_MIB_TCPDELIVERED, /* TCPDelivered */
280280
LINUX_MIB_TCPDELIVEREDCE, /* TCPDeliveredCE */
281+
LINUX_MIB_TCPACKCOMPRESSED, /* TCPAckCompressed */
281282
__LINUX_MIB_MAX
282283
};
283284

net/ipv4/proc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ static const struct snmp_mib snmp4_net_list[] = {
298298
SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS),
299299
SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED),
300300
SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE),
301+
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
301302
SNMP_MIB_SENTINEL
302303
};
303304

net/ipv4/sysctl_net_ipv4.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1;
4646
static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
4747
static int ip_ping_group_range_min[] = { 0, 0 };
4848
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
49+
static int comp_sack_nr_max = 255;
4950

5051
/* obsolete */
5152
static int sysctl_tcp_low_latency __read_mostly;
@@ -1151,6 +1152,22 @@ static struct ctl_table ipv4_net_table[] = {
11511152
.proc_handler = proc_dointvec_minmax,
11521153
.extra1 = &one,
11531154
},
1155+
{
1156+
.procname = "tcp_comp_sack_delay_ns",
1157+
.data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns,
1158+
.maxlen = sizeof(unsigned long),
1159+
.mode = 0644,
1160+
.proc_handler = proc_doulongvec_minmax,
1161+
},
1162+
{
1163+
.procname = "tcp_comp_sack_nr",
1164+
.data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
1165+
.maxlen = sizeof(int),
1166+
.mode = 0644,
1167+
.proc_handler = proc_dointvec_minmax,
1168+
.extra1 = &zero,
1169+
.extra2 = &comp_sack_nr_max,
1170+
},
11541171
{
11551172
.procname = "udp_rmem_min",
11561173
.data = &init_net.ipv4.sysctl_udp_rmem_min,

net/ipv4/tcp.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2595,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags)
25952595
dst_release(sk->sk_rx_dst);
25962596
sk->sk_rx_dst = NULL;
25972597
tcp_saved_syn_free(tp);
2598+
tp->compressed_ack = 0;
25982599

25992600
/* Clean up fastopen related fields */
26002601
tcp_free_fastopen_req(tp);

net/ipv4/tcp_input.c

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4249,6 +4249,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
42494249
* If the sack array is full, forget about the last one.
42504250
*/
42514251
if (this_sack >= TCP_NUM_SACKS) {
4252+
if (tp->compressed_ack)
4253+
tcp_send_ack(sk);
42524254
this_sack--;
42534255
tp->rx_opt.num_sacks--;
42544256
sp--;
@@ -4715,8 +4717,6 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
47154717
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
47164718
goto out_of_window;
47174719

4718-
tcp_enter_quickack_mode(sk);
4719-
47204720
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
47214721
/* Partial packet, seq < rcv_next < end_seq */
47224722
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
@@ -5083,6 +5083,7 @@ static inline void tcp_data_snd_check(struct sock *sk)
50835083
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
50845084
{
50855085
struct tcp_sock *tp = tcp_sk(sk);
5086+
unsigned long rtt, delay;
50865087

50875088
/* More than one full frame received... */
50885089
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5094,15 +5095,36 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
50945095
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
50955096
__tcp_select_window(sk) >= tp->rcv_wnd)) ||
50965097
/* We ACK each frame or... */
5097-
tcp_in_quickack_mode(sk) ||
5098-
/* We have out of order data. */
5099-
(ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) {
5100-
/* Then ack it now */
5098+
tcp_in_quickack_mode(sk)) {
5099+
send_now:
51015100
tcp_send_ack(sk);
5102-
} else {
5103-
/* Else, send delayed ack. */
5101+
return;
5102+
}
5103+
5104+
if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
51045105
tcp_send_delayed_ack(sk);
5106+
return;
51055107
}
5108+
5109+
if (!tcp_is_sack(tp) ||
5110+
tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
5111+
goto send_now;
5112+
tp->compressed_ack++;
5113+
5114+
if (hrtimer_is_queued(&tp->compressed_ack_timer))
5115+
return;
5116+
5117+
/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
5118+
5119+
rtt = tp->rcv_rtt_est.rtt_us;
5120+
if (tp->srtt_us && tp->srtt_us < rtt)
5121+
rtt = tp->srtt_us;
5122+
5123+
delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
5124+
rtt * (NSEC_PER_USEC >> 3)/20);
5125+
sock_hold(sk);
5126+
hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
5127+
HRTIMER_MODE_REL_PINNED_SOFT);
51065128
}
51075129

51085130
static inline void tcp_ack_snd_check(struct sock *sk)

net/ipv4/tcp_ipv4.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2572,6 +2572,8 @@ static int __net_init tcp_sk_init(struct net *net)
25722572
init_net.ipv4.sysctl_tcp_wmem,
25732573
sizeof(init_net.ipv4.sysctl_tcp_wmem));
25742574
}
2575+
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2576+
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
25752577
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
25762578
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
25772579
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;

0 commit comments

Comments
 (0)