
Commit 076433b

edumazet authored and davem330 committed
net_sched: sch_fq: add fast path for mostly idle qdisc
TCQ_F_CAN_BYPASS can be used by a few qdiscs. The idea is that if we queue a packet to an empty qdisc, the following dequeue() will pick it up immediately.

FQ cannot use the generic TCQ_F_CAN_BYPASS code, because some additional checks need to be performed.

This patch adds a similar fast path to FQ. Most of the time the qdisc is not throttled, so many packets can avoid pulling in and touching at least four cache lines, and consuming 128 bytes of memory to store the state of a flow.

After this patch, netperf can send UDP packets about 13% faster, and pktgen goes 30% faster (when FQ is in the way), on a fast NIC.

TCP traffic is also improved, thanks to a reduction of cache line misses. I have measured a 5% increase in throughput on a tcp_rr intensive workload.

tc -s -d qd sh dev eth1
...
qdisc fq 8004: parent 1:2 limit 10000p flow_limit 100p buckets 1024
  orphan_mask 1023 quantum 3028b initial_quantum 15140b low_rate_threshold 550Kbit
  refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
 Sent 5646784384 bytes 1985161 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  flows 122 (inactive 122 throttled 0)
  gc 0 highprio 0 fastpath 659990 throttled 27762 latency 8.57us

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent ee9af4e commit 076433b
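
Editor's note: the bypass idea described in the commit message rests on one observation: enqueueing into an empty qdisc and immediately dequeueing hands back the very same packet, so the queue can be skipped. The self-contained userspace model below illustrates only that idea; every name in it (toy_qdisc, toy_xmit, xmit_one, ...) is made up for illustration and is not kernel API. FQ cannot take this unconditional shortcut because a packet may carry a future delivery time or a pacing rate, which is exactly what the new fq_fastpath_check() in the diff below has to verify first.

/* Userspace model of the "bypass an empty queue" idea; illustrative only. */
#include <stdbool.h>
#include <stdio.h>

struct toy_pkt { int id; };

struct toy_qdisc {
	struct toy_pkt *slot[64];
	unsigned int head, tail, qlen;
	bool can_bypass;		/* models TCQ_F_CAN_BYPASS */
};

static void xmit_one(const struct toy_pkt *p)	/* stands in for the NIC */
{
	printf("xmit packet %d\n", p->id);
}

static void toy_enqueue(struct toy_qdisc *q, struct toy_pkt *p)
{
	q->slot[q->tail++ % 64] = p;
	q->qlen++;
}

static struct toy_pkt *toy_dequeue(struct toy_qdisc *q)
{
	if (!q->qlen)
		return NULL;
	q->qlen--;
	return q->slot[q->head++ % 64];
}

static void toy_xmit(struct toy_qdisc *q, struct toy_pkt *p)
{
	/* Fast path: the queue is empty, so enqueue()+dequeue() would hand
	 * back this very packet; skip the queue entirely.
	 */
	if (q->can_bypass && q->qlen == 0) {
		xmit_one(p);
		return;
	}
	/* Slow path: queue the packet, then drain in order. */
	toy_enqueue(q, p);
	while ((p = toy_dequeue(q)) != NULL)
		xmit_one(p);
}

int main(void)
{
	struct toy_qdisc q = { .can_bypass = true };
	struct toy_pkt a = { 1 }, b = { 2 };

	toy_xmit(&q, &a);	/* takes the bypass */
	q.can_bypass = false;
	toy_xmit(&q, &b);	/* goes through the queue */
	return 0;
}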

File tree

2 files changed: +92 −37 lines


include/uapi/linux/pkt_sched.h

Lines changed: 1 addition & 0 deletions
@@ -962,6 +962,7 @@ struct tc_fq_qd_stats {
 	__u64	ce_mark;		/* packets above ce_threshold */
 	__u64	horizon_drops;
 	__u64	horizon_caps;
+	__u64	fastpath_packets;
 };
 
 /* Heavy-Hitter Filter */
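
Because the new __u64 is appended at the tail of struct tc_fq_qd_stats, older kernels simply emit a shorter stats blob and existing userspace stays compatible. Below is a minimal sketch of how a dumper could consume the field, assuming the caller has already copied the qdisc's xstats attribute out of netlink (as iproute2's tc does); print_fq_fastpath() is a hypothetical helper, not part of any existing tool.

#include <stddef.h>
#include <stdio.h>
#include <linux/pkt_sched.h>	/* struct tc_fq_qd_stats with fastpath_packets */

/* Hypothetical helper: only touch the new field if the kernel sent it. */
static void print_fq_fastpath(const struct tc_fq_qd_stats *st, size_t len)
{
	if (len >= offsetof(struct tc_fq_qd_stats, fastpath_packets) +
		   sizeof(st->fastpath_packets))
		printf("fastpath %llu\n",
		       (unsigned long long)st->fastpath_packets);
}

This is, conceptually, where the "fastpath 659990" figure in the tc output quoted in the commit message comes from.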

net/sched/sch_fq.c

Lines changed: 91 additions & 37 deletions
@@ -2,7 +2,7 @@
 /*
  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
  *
- * Copyright (C) 2013-2015 Eric Dumazet <[email protected]>
+ * Copyright (C) 2013-2023 Eric Dumazet <[email protected]>
  *
  * Meant to be mostly used for locally generated traffic :
  * Fast classification depends on skb->sk being set before reaching us.
@@ -73,7 +73,13 @@ struct fq_flow {
 		struct sk_buff *tail;	/* last skb in the list */
 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
 	};
-	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+	union {
+		struct rb_node	fq_node; /* anchor in fq_root[] trees */
+		/* Following field is only used for q->internal,
+		 * because q->internal is not hashed in fq_root[]
+		 */
+		u64		stat_fastpath_packets;
+	};
 	struct sock	*sk;
 	u32		socket_hash;	/* sk_hash */
 	int		qlen;		/* number of packets in flow queue */
@@ -134,7 +140,7 @@ struct fq_sched_data {
 
 /* Seldom used fields. */
 
-	u64		stat_internal_packets;
+	u64		stat_internal_packets; /* aka highprio */
 	u64		stat_ce_mark;
 	u64		stat_horizon_drops;
 	u64		stat_horizon_caps;
@@ -266,17 +272,64 @@ static void fq_gc(struct fq_sched_data *q,
 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 }
 
-static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+/* Fast path can be used if :
+ * 1) Packet tstamp is in the past.
+ * 2) FQ qlen == 0 OR
+ *   (no flow is currently eligible for transmit,
+ *    AND fast path queue has less than 8 packets)
+ * 3) No SO_MAX_PACING_RATE on the socket (if any).
+ * 4) No @maxrate attribute on this qdisc,
+ *
+ * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
+ */
+static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb)
+{
+	const struct fq_sched_data *q = qdisc_priv(sch);
+	const struct sock *sk;
+
+	if (fq_skb_cb(skb)->time_to_send > q->ktime_cache)
+		return false;
+
+	if (sch->q.qlen != 0) {
+		/* Even if some packets are stored in this qdisc,
+		 * we can still enable fast path if all of them are
+		 * scheduled in the future (ie no flows are eligible)
+		 * or in the fast path queue.
+		 */
+		if (q->flows != q->inactive_flows + q->throttled_flows)
+			return false;
+
+		/* Do not allow fast path queue to explode, we want Fair Queue mode
+		 * under pressure.
+		 */
+		if (q->internal.qlen >= 8)
+			return false;
+	}
+
+	sk = skb->sk;
+	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
+	    sk->sk_max_pacing_rate != ~0UL)
+		return false;
+
+	if (q->flow_max_rate != ~0UL)
+		return false;
+
+	return true;
+}
+
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
 	struct rb_node **p, *parent;
 	struct sock *sk = skb->sk;
 	struct rb_root *root;
 	struct fq_flow *f;
 
 	/* warning: no starvation prevention... */
-	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
+	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) {
+		q->stat_internal_packets++; /* highprio packet */
 		return &q->internal;
-
+	}
 	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
 	 * or a listener (SYNCOOKIE mode)
 	 * 1) request sockets are not full blown,
@@ -307,6 +360,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 		sk = (struct sock *)((hash << 1) | 1UL);
 	}
 
+	if (fq_fastpath_check(sch, skb)) {
+		q->internal.stat_fastpath_packets++;
+		return &q->internal;
+	}
+
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 
 	if (q->flows >= (2U << q->fq_trees_log) &&
@@ -402,12 +460,8 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 			   struct sk_buff *skb)
 {
-	struct fq_sched_data *q = qdisc_priv(sch);
-
 	fq_erase_head(sch, flow, skb);
 	skb_mark_not_on_list(skb);
-	if (--flow->qlen == 0)
-		q->inactive_flows++;
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 }
@@ -459,49 +513,45 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(sch->q.qlen >= sch->limit))
 		return qdisc_drop(skb, sch, to_free);
 
+	q->ktime_cache = ktime_get_ns();
 	if (!skb->tstamp) {
-		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+		fq_skb_cb(skb)->time_to_send = q->ktime_cache;
 	} else {
-		/* Check if packet timestamp is too far in the future.
-		 * Try first if our cached value, to avoid ktime_get_ns()
-		 * cost in most cases.
-		 */
+		/* Check if packet timestamp is too far in the future. */
 		if (fq_packet_beyond_horizon(skb, q)) {
-			/* Refresh our cache and check another time */
-			q->ktime_cache = ktime_get_ns();
-			if (fq_packet_beyond_horizon(skb, q)) {
-				if (q->horizon_drop) {
+			if (q->horizon_drop) {
 					q->stat_horizon_drops++;
 					return qdisc_drop(skb, sch, to_free);
-				}
-				q->stat_horizon_caps++;
-				skb->tstamp = q->ktime_cache + q->horizon;
 			}
+			q->stat_horizon_caps++;
+			skb->tstamp = q->ktime_cache + q->horizon;
 		}
 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
 	}
 
-	f = fq_classify(skb, q);
-	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
-		q->stat_flows_plimit++;
-		return qdisc_drop(skb, sch, to_free);
-	}
+	f = fq_classify(sch, skb);
 
-	if (f->qlen++ == 0)
-		q->inactive_flows--;
-	qdisc_qstats_backlog_inc(sch, skb);
-	if (fq_flow_is_detached(f)) {
-		fq_flow_add_tail(&q->new_flows, f);
-		if (time_after(jiffies, f->age + q->flow_refill_delay))
-			f->credit = max_t(u32, f->credit, q->quantum);
+	if (f != &q->internal) {
+		if (unlikely(f->qlen >= q->flow_plimit)) {
+			q->stat_flows_plimit++;
+			return qdisc_drop(skb, sch, to_free);
+		}
+
+		if (fq_flow_is_detached(f)) {
+			fq_flow_add_tail(&q->new_flows, f);
+			if (time_after(jiffies, f->age + q->flow_refill_delay))
+				f->credit = max_t(u32, f->credit, q->quantum);
+		}
+
+		if (f->qlen == 0)
+			q->inactive_flows--;
 	}
 
+	f->qlen++;
 	/* Note: this overwrites f->age */
 	flow_queue_add(f, skb);
 
-	if (unlikely(f == &q->internal)) {
-		q->stat_internal_packets++;
-	}
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -549,6 +599,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 
 	skb = fq_peek(&q->internal);
 	if (unlikely(skb)) {
+		q->internal.qlen--;
 		fq_dequeue_skb(sch, &q->internal, skb);
 		goto out;
 	}
@@ -592,6 +643,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
+		if (--f->qlen == 0)
+			q->inactive_flows++;
 		fq_dequeue_skb(sch, f, skb);
 	} else {
 		head->first = f->next;
@@ -1024,6 +1077,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 
 	st.gc_flows		  = q->stat_gc_flows;
 	st.highprio_packets	  = q->stat_internal_packets;
+	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
 	st.tcp_retrans		  = 0;
 	st.throttled		  = q->stat_throttled;
 	st.flows_plimit		  = q->stat_flows_plimit;
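
Editor's aside on the struct fq_flow hunk above: q->internal is never inserted into fq_root[], so its rb_node slot can be reused for the per-qdisc fast-path counter, and the new statistic costs no extra memory. The tiny standalone C sketch below illustrates the union trick; struct rb_node_model is a stand-in with rb_node's rough layout (one unsigned long plus two pointers), not the kernel type.

#include <stdint.h>
#include <stdio.h>

struct rb_node_model {			/* stand-in for struct rb_node */
	unsigned long parent_color;
	void *rb_right, *rb_left;
};

struct flow_model {
	union {
		struct rb_node_model fq_node;	/* hashed flows use this */
		uint64_t stat_fastpath_packets;	/* only the internal flow */
	};
};

int main(void)
{
	/* The counter fits in space the structure already pays for. */
	printf("union: %zu bytes, rb_node alone: %zu bytes\n",
	       sizeof(struct flow_model), sizeof(struct rb_node_model));
	return 0;
}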
