Skip to content

Commit e26925e

Browse files
committed
Merge branch 'tcp-TCP-TS-option-use-1-ms-clock'
Eric Dumazet says: ==================== tcp: TCP TS option use 1 ms clock TCP Timestamps option is defined in RFC 7323. Traditionally on Linux, it has been tied to the internal 'jiffy' variable, because it had been a cheap and good enough generator. Unfortunately some distros use HZ=250 or even HZ=100 leading to not very useful TCP timestamps. For TCP flows in the DC, Google has used usec resolution for more than two years with great success [1]. RCVBUF autotuning is more precise. This series converts tp->tcp_mstamp to a plain u64 value storing a 1 usec TCP clock. This choice will allow us to upstream the 1 usec TS option as discussed in IETF 97. Kathleen Nichols [2] and others advocate for 1ms TS clocks for network analysis. (1ms being the lowest value supported by RFC 7323.) [1] https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf [2] http://netseminar.stanford.edu/seminars/02_02_17.pdf ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
2 parents 9d4f97f + 9a568de commit e26925e

24 files changed

+259
-274
lines changed

include/linux/skbuff.h

Lines changed: 1 addition & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -506,66 +506,6 @@ typedef unsigned int sk_buff_data_t;
506506
typedef unsigned char *sk_buff_data_t;
507507
#endif
508508

509-
/**
510-
* struct skb_mstamp - multi resolution time stamps
511-
* @stamp_us: timestamp in us resolution
512-
* @stamp_jiffies: timestamp in jiffies
513-
*/
514-
struct skb_mstamp {
515-
union {
516-
u64 v64;
517-
struct {
518-
u32 stamp_us;
519-
u32 stamp_jiffies;
520-
};
521-
};
522-
};
523-
524-
/**
525-
* skb_mstamp_get - get current timestamp
526-
* @cl: place to store timestamps
527-
*/
528-
static inline void skb_mstamp_get(struct skb_mstamp *cl)
529-
{
530-
u64 val = local_clock();
531-
532-
do_div(val, NSEC_PER_USEC);
533-
cl->stamp_us = (u32)val;
534-
cl->stamp_jiffies = (u32)jiffies;
535-
}
536-
537-
/**
538-
* skb_mstamp_us_delta - compute the difference in usec between two skb_mstamp
539-
* @t1: pointer to newest sample
540-
* @t0: pointer to oldest sample
541-
*/
542-
static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
543-
const struct skb_mstamp *t0)
544-
{
545-
s32 delta_us = t1->stamp_us - t0->stamp_us;
546-
u32 delta_jiffies = t1->stamp_jiffies - t0->stamp_jiffies;
547-
548-
/* If delta_us is negative, this might be because interval is too big,
549-
* or local_clock() drift is too big : fallback using jiffies.
550-
*/
551-
if (delta_us <= 0 ||
552-
delta_jiffies >= (INT_MAX / (USEC_PER_SEC / HZ)))
553-
554-
delta_us = jiffies_to_usecs(delta_jiffies);
555-
556-
return delta_us;
557-
}
558-
559-
static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
560-
const struct skb_mstamp *t0)
561-
{
562-
s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
563-
564-
if (!diff)
565-
diff = t1->stamp_us - t0->stamp_us;
566-
return diff > 0;
567-
}
568-
569509
/**
570510
* struct sk_buff - socket buffer
571511
* @next: Next buffer in list
@@ -646,7 +586,7 @@ struct sk_buff {
646586

647587
union {
648588
ktime_t tstamp;
649-
struct skb_mstamp skb_mstamp;
589+
u64 skb_mstamp;
650590
};
651591
};
652592
struct rb_node rbnode; /* used in netem & tcp stack */

include/linux/tcp.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ struct tcp_request_sock_ops;
123123
struct tcp_request_sock {
124124
struct inet_request_sock req;
125125
const struct tcp_request_sock_ops *af_specific;
126-
struct skb_mstamp snt_synack; /* first SYNACK sent time */
126+
u64 snt_synack; /* first SYNACK sent time */
127127
bool tfo_listener;
128128
u32 txhash;
129129
u32 rcv_isn;
@@ -211,7 +211,7 @@ struct tcp_sock {
211211

212212
/* Information of the most recently (s)acked skb */
213213
struct tcp_rack {
214-
struct skb_mstamp mstamp; /* (Re)sent time of the skb */
214+
u64 mstamp; /* (Re)sent time of the skb */
215215
u32 rtt_us; /* Associated RTT */
216216
u32 end_seq; /* Ending TCP sequence of the skb */
217217
u8 advanced; /* mstamp advanced since last lost marking */
@@ -240,7 +240,7 @@ struct tcp_sock {
240240
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
241241

242242
/* RTT measurement */
243-
struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */
243+
u64 tcp_mstamp; /* most recent packet received/sent */
244244
u32 srtt_us; /* smoothed round trip time << 3 in usecs */
245245
u32 mdev_us; /* medium deviation */
246246
u32 mdev_max_us; /* maximal mdev for the last rtt period */
@@ -280,8 +280,8 @@ struct tcp_sock {
280280
u32 delivered; /* Total data packets delivered incl. rexmits */
281281
u32 lost; /* Total data packets lost incl. rexmits */
282282
u32 app_limited; /* limited until "delivered" reaches this val */
283-
struct skb_mstamp first_tx_mstamp; /* start of window send phase */
284-
struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
283+
u64 first_tx_mstamp; /* start of window send phase */
284+
u64 delivered_mstamp; /* time we reached "delivered" */
285285
u32 rate_delivered; /* saved rate sample: packets delivered */
286286
u32 rate_interval_us; /* saved rate sample: time elapsed */
287287

@@ -335,16 +335,16 @@ struct tcp_sock {
335335

336336
/* Receiver side RTT estimation */
337337
struct {
338-
u32 rtt_us;
339-
u32 seq;
340-
struct skb_mstamp time;
338+
u32 rtt_us;
339+
u32 seq;
340+
u64 time;
341341
} rcv_rtt_est;
342342

343343
/* Receiver queue space */
344344
struct {
345-
int space;
346-
u32 seq;
347-
struct skb_mstamp time;
345+
int space;
346+
u32 seq;
347+
u64 time;
348348
} rcvq_space;
349349

350350
/* TCP-specific MTU probe information. */

include/net/tcp.h

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ static inline u32 tcp_cookie_time(void)
519519
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
520520
u16 *mssp);
521521
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
522-
__u32 cookie_init_timestamp(struct request_sock *req);
522+
u64 cookie_init_timestamp(struct request_sock *req);
523523
bool cookie_timestamp_decode(struct tcp_options_received *opt);
524524
bool cookie_ecn_ok(const struct tcp_options_received *opt,
525525
const struct net *net, const struct dst_entry *dst);
@@ -700,17 +700,61 @@ u32 __tcp_select_window(struct sock *sk);
700700

701701
void tcp_send_window_probe(struct sock *sk);
702702

703-
/* TCP timestamps are only 32-bits, this causes a slight
704-
* complication on 64-bit systems since we store a snapshot
705-
* of jiffies in the buffer control blocks below. We decided
706-
* to use only the low 32-bits of jiffies and hide the ugly
707-
* casts with the following macro.
703+
/* TCP uses 32bit jiffies to save some space.
704+
* Note that this is different from tcp_time_stamp, which
705+
* historically has been the same until linux-4.13.
708706
*/
709-
#define tcp_time_stamp ((__u32)(jiffies))
707+
#define tcp_jiffies32 ((u32)jiffies)
708+
709+
/*
710+
* Deliver a 32bit value for TCP timestamp option (RFC 7323)
711+
* It is no longer tied to jiffies, but to a 1 ms clock.
712+
* Note: double check if you want to use tcp_jiffies32 instead of this.
713+
*/
714+
#define TCP_TS_HZ 1000
715+
716+
static inline u64 tcp_clock_ns(void)
717+
{
718+
return local_clock();
719+
}
720+
721+
static inline u64 tcp_clock_us(void)
722+
{
723+
return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
724+
}
725+
726+
/* This should only be used in contexts where tp->tcp_mstamp is up to date */
727+
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
728+
{
729+
return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
730+
}
731+
732+
/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
733+
static inline u32 tcp_time_stamp_raw(void)
734+
{
735+
return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
736+
}
737+
738+
739+
/* Refresh 1us clock of a TCP socket,
740+
* ensuring monotonically increasing values.
741+
*/
742+
static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
743+
{
744+
u64 val = tcp_clock_us();
745+
746+
if (val > tp->tcp_mstamp)
747+
tp->tcp_mstamp = val;
748+
}
749+
750+
static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
751+
{
752+
return max_t(s64, t1 - t0, 0);
753+
}
710754

711755
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
712756
{
713-
return skb->skb_mstamp.stamp_jiffies;
757+
return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
714758
}
715759

716760

@@ -775,9 +819,9 @@ struct tcp_skb_cb {
775819
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
776820
__u32 delivered;
777821
/* start of send pipeline phase */
778-
struct skb_mstamp first_tx_mstamp;
822+
u64 first_tx_mstamp;
779823
/* when we reached the "delivered" count */
780-
struct skb_mstamp delivered_mstamp;
824+
u64 delivered_mstamp;
781825
} tx; /* only used for outgoing skbs */
782826
union {
783827
struct inet_skb_parm h4;
@@ -893,7 +937,7 @@ struct ack_sample {
893937
* A sample is invalid if "delivered" or "interval_us" is negative.
894938
*/
895939
struct rate_sample {
896-
struct skb_mstamp prior_mstamp; /* starting timestamp for interval */
940+
u64 prior_mstamp; /* starting timestamp for interval */
897941
u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
898942
s32 delivered; /* number of packets delivered over interval */
899943
long interval_us; /* time for tp->delivered to incr "delivered" */
@@ -1242,7 +1286,7 @@ static inline void tcp_slow_start_after_idle_check(struct sock *sk)
12421286
if (!sysctl_tcp_slow_start_after_idle || tp->packets_out ||
12431287
ca_ops->cong_control)
12441288
return;
1245-
delta = tcp_time_stamp - tp->lsndtime;
1289+
delta = tcp_jiffies32 - tp->lsndtime;
12461290
if (delta > inet_csk(sk)->icsk_rto)
12471291
tcp_cwnd_restart(sk, delta);
12481292
}
@@ -1304,8 +1348,8 @@ static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp)
13041348
{
13051349
const struct inet_connection_sock *icsk = &tp->inet_conn;
13061350

1307-
return min_t(u32, tcp_time_stamp - icsk->icsk_ack.lrcvtime,
1308-
tcp_time_stamp - tp->rcv_tstamp);
1351+
return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime,
1352+
tcp_jiffies32 - tp->rcv_tstamp);
13091353
}
13101354

13111355
static inline int tcp_fin_time(const struct sock *sk)
@@ -1859,7 +1903,7 @@ void tcp_init(void);
18591903
/* tcp_recovery.c */
18601904
extern void tcp_rack_mark_lost(struct sock *sk);
18611905
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
1862-
const struct skb_mstamp *xmit_time);
1906+
u64 xmit_time);
18631907
extern void tcp_rack_reo_timeout(struct sock *sk);
18641908

18651909
/*

net/dccp/ccids/ccid2.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
233233
{
234234
struct dccp_sock *dp = dccp_sk(sk);
235235
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
236-
const u32 now = ccid2_time_stamp;
236+
const u32 now = ccid2_jiffies32;
237237
struct ccid2_seq *next;
238238

239239
/* slow-start after idle periods (RFC 2581, RFC 2861) */
@@ -466,7 +466,7 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
466466
* The cleanest solution is to not use the ccid2s_sent field at all
467467
* and instead use DCCP timestamps: requires changes in other places.
468468
*/
469-
ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
469+
ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent);
470470
}
471471

472472
static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
@@ -478,7 +478,7 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
478478
return;
479479
}
480480

481-
hc->tx_last_cong = ccid2_time_stamp;
481+
hc->tx_last_cong = ccid2_jiffies32;
482482

483483
hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
484484
hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
@@ -731,7 +731,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
731731

732732
hc->tx_rto = DCCP_TIMEOUT_INIT;
733733
hc->tx_rpdupack = -1;
734-
hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp;
734+
hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32;
735735
hc->tx_cwnd_used = 0;
736736
setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
737737
(unsigned long)sk);

net/dccp/ccids/ccid2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
* CCID-2 timestamping faces the same issues as TCP timestamping.
2828
* Hence we reuse/share as much of the code as possible.
2929
*/
30-
#define ccid2_time_stamp tcp_time_stamp
30+
#define ccid2_jiffies32 ((u32)jiffies)
3131

3232
/* NUMDUPACK parameter from RFC 4341, p. 6 */
3333
#define NUMDUPACK 3

net/ipv4/syncookies.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
6666
* Since subsequent timestamps use the normal tcp_time_stamp value, we
6767
* must make sure that the resulting initial timestamp is <= tcp_time_stamp.
6868
*/
69-
__u32 cookie_init_timestamp(struct request_sock *req)
69+
u64 cookie_init_timestamp(struct request_sock *req)
7070
{
7171
struct inet_request_sock *ireq;
72-
u32 ts, ts_now = tcp_time_stamp;
72+
u32 ts, ts_now = tcp_time_stamp_raw();
7373
u32 options = 0;
7474

7575
ireq = inet_rsk(req);
@@ -88,7 +88,7 @@ __u32 cookie_init_timestamp(struct request_sock *req)
8888
ts <<= TSBITS;
8989
ts |= options;
9090
}
91-
return ts;
91+
return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ);
9292
}
9393

9494

@@ -343,7 +343,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
343343
ireq->wscale_ok = tcp_opt.wscale_ok;
344344
ireq->tstamp_ok = tcp_opt.saw_tstamp;
345345
req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
346-
treq->snt_synack.v64 = 0;
346+
treq->snt_synack = 0;
347347
treq->tfo_listener = false;
348348

349349
ireq->ir_iif = inet_request_bound_dev_if(sk, skb);

net/ipv4/tcp.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -386,7 +386,7 @@ void tcp_init_sock(struct sock *sk)
386386

387387
icsk->icsk_rto = TCP_TIMEOUT_INIT;
388388
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
389-
minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U);
389+
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
390390

391391
/* So many TCP implementations out there (incorrectly) count the
392392
* initial SYN frame in their delayed-ACK and congestion control
@@ -2706,7 +2706,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
27062706
if (!tp->repair)
27072707
err = -EPERM;
27082708
else
2709-
tp->tsoffset = val - tcp_time_stamp;
2709+
tp->tsoffset = val - tcp_time_stamp_raw();
27102710
break;
27112711
case TCP_REPAIR_WINDOW:
27122712
err = tcp_repair_set_window(tp, optval, optlen);
@@ -2757,7 +2757,7 @@ static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
27572757
for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
27582758
stats[i] = tp->chrono_stat[i - 1];
27592759
if (i == tp->chrono_type)
2760-
stats[i] += tcp_time_stamp - tp->chrono_start;
2760+
stats[i] += tcp_jiffies32 - tp->chrono_start;
27612761
stats[i] *= USEC_PER_SEC / HZ;
27622762
total += stats[i];
27632763
}
@@ -2841,7 +2841,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
28412841
info->tcpi_retrans = tp->retrans_out;
28422842
info->tcpi_fackets = tp->fackets_out;
28432843

2844-
now = tcp_time_stamp;
2844+
now = tcp_jiffies32;
28452845
info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
28462846
info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
28472847
info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
@@ -3072,7 +3072,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
30723072
break;
30733073

30743074
case TCP_TIMESTAMP:
3075-
val = tcp_time_stamp + tp->tsoffset;
3075+
val = tcp_time_stamp_raw() + tp->tsoffset;
30763076
break;
30773077
case TCP_NOTSENT_LOWAT:
30783078
val = tp->notsent_lowat;

0 commit comments

Comments
 (0)