
Commit 22a9202

sched_ext: Implement tickless support
Allow BPF schedulers to indicate tickless operation by setting p->scx.slice
to SCX_SLICE_INF. A CPU whose current task has an infinite slice goes into
tickless operation.

scx_central is updated to use tickless operation for all tasks and to expire
slices with a BPF timer instead. This also uses the SCX_ENQ_PREEMPT and task
state tracking added by the previous patches.

Currently, there is no way to pin the timer on the central CPU, so it may end
up on one of the worker CPUs; however, outside of that, the worker CPUs can go
tickless both while running sched_ext tasks and idling.

With schbench running, scx_central shows:

  root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts
  LOC:     142024        656        664        449   Local timer interrupts
  LOC:     161663        663        665        449   Local timer interrupts

Without it:

  root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts
  LOC:     188778       3142       3793       3993   Local timer interrupts
  LOC:     198993       5314       6323       6438   Local timer interrupts

While scx_central itself is too barebones to be useful as a production
scheduler, a more featureful central scheduler can be built using the same
approach. Google's experience shows that such an approach can have significant
benefits for certain applications such as VM hosting.

v4: Allow operation even if BPF_F_TIMER_CPU_PIN is not available.

v3: Pin the central scheduler's timer on the central_cpu using
    BPF_F_TIMER_CPU_PIN.

v2: Convert to BPF inline iterators.

Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: David Vernet <[email protected]>
Acked-by: Josh Don <[email protected]>
Acked-by: Hao Luo <[email protected]>
Acked-by: Barret Rhoden <[email protected]>
1 parent 1c29f85 commit 22a9202
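
To make the tickless contract above concrete, here is a minimal illustrative sketch of a hypothetical BPF scheduler (not part of this commit, and much simpler than scx_central below): every task is dispatched with SCX_SLICE_INF so the tick can be stopped, and a BPF timer callback forces rescheduling with scx_bpf_kick_cpu(). The kfuncs and constants (scx_bpf_dispatch(), scx_bpf_kick_cpu(), SCX_SLICE_INF, SCX_DSQ_LOCAL, SCX_KICK_PREEMPT, bpf_for()) are the ones used by this commit; the include path, the sketch_* names, and the omitted timer setup (bpf_timer_init()/bpf_timer_set_callback()/bpf_timer_start(), see central_init below) are assumptions made for illustration only.

/* Illustrative sketch only -- not part of this commit, not scx_central. */
#include <scx/common.bpf.h>             /* assumed tools/sched_ext header layout */

char _license[] SEC("license") = "GPL";

const volatile u32 nr_cpu_ids = 1;      /* set by the loader, as in scx_central */

/*
 * Dispatch every task with an infinite slice. On CONFIG_NO_HZ_FULL kernels
 * booted with nohz_full=, this lets the tick be stopped while the task runs.
 */
void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
        scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, enq_flags);
}

/*
 * Because an SCX_SLICE_INF slice never expires, the scheduler itself has to
 * force rescheduling, e.g. from a periodic BPF timer. Timer registration is
 * omitted here; see central_init in the diff below for the full pattern.
 */
static int sketch_timerfn(void *map, int *key, struct bpf_timer *timer)
{
        s32 cpu;

        bpf_for(cpu, 0, nr_cpu_ids)
                scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
        return 0;
}

SCX_OPS_DEFINE(sketch_ops,
               .enqueue = (void *)sketch_enqueue,
               .name    = "tickless_sketch");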


7 files changed: +242 additions, -13 deletions


include/linux/sched/ext.h

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@ enum scx_public_consts {
         SCX_OPS_NAME_LEN        = 128,
 
         SCX_SLICE_DFL           = 20 * 1000000,        /* 20ms */
+        SCX_SLICE_INF           = U64_MAX,             /* infinite, implies nohz */
 };
 
 /*

kernel/sched/core.c

Lines changed: 7 additions & 4 deletions
@@ -1256,11 +1256,14 @@ bool sched_can_stop_tick(struct rq *rq)
                 return true;
 
         /*
-         * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
-         * if there's more than one we need the tick for involuntary
-         * preemption.
+         * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+         * left. For CFS, if there's more than one we need the tick for
+         * involuntary preemption. For SCX, ask.
          */
-        if (rq->nr_running > 1)
+        if (!scx_switched_all() && rq->nr_running > 1)
+                return false;
+
+        if (scx_enabled() && !scx_can_stop_tick(rq))
                 return false;
 
         /*

kernel/sched/ext.c

Lines changed: 50 additions & 2 deletions
@@ -1086,7 +1086,8 @@ static void update_curr_scx(struct rq *rq)
         account_group_exec_runtime(curr, delta_exec);
         cgroup_account_cputime(curr, delta_exec);
 
-        curr->scx.slice -= min(curr->scx.slice, delta_exec);
+        if (curr->scx.slice != SCX_SLICE_INF)
+                curr->scx.slice -= min(curr->scx.slice, delta_exec);
 }
 
 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
@@ -2093,6 +2094,28 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
                 SCX_CALL_OP(SCX_KF_REST, running, p);
 
         clr_task_runnable(p, true);
+
+        /*
+         * @p is getting newly scheduled or got kicked after someone updated its
+         * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
+         */
+        if ((p->scx.slice == SCX_SLICE_INF) !=
+            (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
+                if (p->scx.slice == SCX_SLICE_INF)
+                        rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
+                else
+                        rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
+
+                sched_update_tick_dependency(rq);
+
+                /*
+                 * For now, let's refresh the load_avgs just when transitioning
+                 * in and out of nohz. In the future, we might want to add a
+                 * mechanism which calls the following periodically on
+                 * tick-stopped CPUs.
+                 */
+                update_other_load_avgs(rq);
+        }
 }
 
 static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
@@ -2818,6 +2841,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
         return 0;
 }
 
+#ifdef CONFIG_NO_HZ_FULL
+bool scx_can_stop_tick(struct rq *rq)
+{
+        struct task_struct *p = rq->curr;
+
+        if (scx_ops_bypassing())
+                return false;
+
+        if (p->sched_class != &ext_sched_class)
+                return true;
+
+        /*
+         * @rq can dispatch from different DSQs, so we can't tell whether it
+         * needs the tick or not by looking at nr_running. Allow stopping ticks
+         * iff the BPF scheduler indicated so. See set_next_task_scx().
+         */
+        return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
+}
+#endif
+
 /*
  * Omitted operations:
  *
@@ -3120,6 +3163,9 @@ static void scx_ops_bypass(bool bypass)
                 }
 
                 rq_unlock_irqrestore(rq, &rf);
+
+                /* kick to restore ticks */
+                resched_cpu(cpu);
         }
 }
 
@@ -4576,7 +4622,9 @@ __bpf_kfunc_start_defs();
  * BPF locks (in the future when BPF introduces more flexible locking).
  *
  * @p is allowed to run for @slice. The scheduling path is triggered on slice
- * exhaustion. If zero, the current residual slice is maintained.
+ * exhaustion. If zero, the current residual slice is maintained. If
+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with
+ * scx_bpf_kick_cpu() to trigger scheduling.
  */
 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
                                   u64 enq_flags)

kernel/sched/ext.h

Lines changed: 2 additions & 0 deletions
@@ -35,6 +35,7 @@ void scx_pre_fork(struct task_struct *p);
 int scx_fork(struct task_struct *p);
 void scx_post_fork(struct task_struct *p);
 void scx_cancel_fork(struct task_struct *p);
+bool scx_can_stop_tick(struct rq *rq);
 int scx_check_setscheduler(struct task_struct *p, int policy);
 bool task_should_scx(struct task_struct *p);
 void init_sched_ext_class(void);
@@ -73,6 +74,7 @@ static inline void scx_pre_fork(struct task_struct *p) {}
 static inline int scx_fork(struct task_struct *p) { return 0; }
 static inline void scx_post_fork(struct task_struct *p) {}
 static inline void scx_cancel_fork(struct task_struct *p) {}
+static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
 static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
 static inline bool task_on_scx(const struct task_struct *p) { return false; }
 static inline void init_sched_ext_class(void) {}

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
@@ -727,6 +727,7 @@ struct cfs_rq {
 /* scx_rq->flags, protected by the rq lock */
 enum scx_rq_flags {
         SCX_RQ_BALANCING        = 1 << 1,
+        SCX_RQ_CAN_STOP_TICK    = 1 << 2,
 };
 
 struct scx_rq {

tools/sched_ext/scx_central.bpf.c

Lines changed: 153 additions & 6 deletions
@@ -13,7 +13,26 @@
  *    through per-CPU BPF queues. The current design is chosen to maximally
  *    utilize and verify various SCX mechanisms such as LOCAL_ON dispatching.
  *
- * b. Preemption
+ * b. Tickless operation
+ *
+ *    All tasks are dispatched with the infinite slice which allows stopping the
+ *    ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full
+ *    parameter. The tickless operation can be observed through
+ *    /proc/interrupts.
+ *
+ *    Periodic switching is enforced by a periodic timer checking all CPUs and
+ *    preempting them as necessary. Unfortunately, BPF timer currently doesn't
+ *    have a way to pin to a specific CPU, so the periodic timer isn't pinned to
+ *    the central CPU.
+ *
+ * c. Preemption
+ *
+ *    Kthreads are unconditionally queued to the head of a matching local dsq
+ *    and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always
+ *    prioritized over user threads, which is required for ensuring forward
+ *    progress as e.g. the periodic timer may run on a ksoftirqd and if the
+ *    ksoftirqd gets starved by a user thread, there may not be anything else to
+ *    vacate that user thread.
  *
  *    SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the
  *    next tasks.
@@ -32,14 +51,17 @@ char _license[] SEC("license") = "GPL";
 
 enum {
         FALLBACK_DSQ_ID         = 0,
+        MS_TO_NS                = 1000LLU * 1000,
+        TIMER_INTERVAL_NS       = 1 * MS_TO_NS,
 };
 
 const volatile s32 central_cpu;
 const volatile u32 nr_cpu_ids = 1;      /* !0 for veristat, set during init */
 const volatile u64 slice_ns = SCX_SLICE_DFL;
 
+bool timer_pinned = true;
 u64 nr_total, nr_locals, nr_queued, nr_lost_pids;
-u64 nr_dispatches, nr_mismatches, nr_retries;
+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries;
 u64 nr_overflows;
 
 UEI_DEFINE(uei);
@@ -52,6 +74,23 @@ struct {
 
 /* can't use percpu map due to bad lookups */
 bool RESIZABLE_ARRAY(data, cpu_gimme_task);
+u64 RESIZABLE_ARRAY(data, cpu_started_at);
+
+struct central_timer {
+        struct bpf_timer timer;
+};
+
+struct {
+        __uint(type, BPF_MAP_TYPE_ARRAY);
+        __uint(max_entries, 1);
+        __type(key, u32);
+        __type(value, struct central_timer);
+} central_timer SEC(".maps");
+
+static bool vtime_before(u64 a, u64 b)
+{
+        return (s64)(a - b) < 0;
+}
 
 s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p,
                    s32 prev_cpu, u64 wake_flags)
@@ -71,9 +110,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags)
 
         __sync_fetch_and_add(&nr_total, 1);
 
+        /*
+         * Push per-cpu kthreads at the head of local dsq's and preempt the
+         * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked
+         * behind other threads which is necessary for forward progress
+         * guarantee as we depend on the BPF timer which may run from ksoftirqd.
+         */
+        if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) {
+                __sync_fetch_and_add(&nr_locals, 1);
+                scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
+                                 enq_flags | SCX_ENQ_PREEMPT);
+                return;
+        }
+
         if (bpf_map_push_elem(&central_q, &pid, 0)) {
                 __sync_fetch_and_add(&nr_overflows, 1);
-                scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags);
+                scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags);
                 return;
         }
 
@@ -106,7 +158,7 @@ static bool dispatch_to_cpu(s32 cpu)
                  */
                 if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
                         __sync_fetch_and_add(&nr_mismatches, 1);
-                        scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0);
+                        scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
                         bpf_task_release(p);
                         /*
                          * We might run out of dispatch buffer slots if we continue dispatching
@@ -120,7 +172,7 @@ static bool dispatch_to_cpu(s32 cpu)
                 }
 
                 /* dispatch to local and mark that @cpu doesn't need more */
-                scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+                scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0);
 
                 if (cpu != central_cpu)
                         scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
@@ -188,9 +240,102 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)
         }
 }
 
+void BPF_STRUCT_OPS(central_running, struct task_struct *p)
+{
+        s32 cpu = scx_bpf_task_cpu(p);
+        u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+        if (started_at)
+                *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */
+}
+
+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable)
+{
+        s32 cpu = scx_bpf_task_cpu(p);
+        u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+        if (started_at)
+                *started_at = 0;
+}
+
+static int central_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+        u64 now = bpf_ktime_get_ns();
+        u64 nr_to_kick = nr_queued;
+        s32 i, curr_cpu;
+
+        curr_cpu = bpf_get_smp_processor_id();
+        if (timer_pinned && (curr_cpu != central_cpu)) {
+                scx_bpf_error("Central timer ran on CPU %d, not central CPU %d",
+                              curr_cpu, central_cpu);
+                return 0;
+        }
+
+        bpf_for(i, 0, nr_cpu_ids) {
+                s32 cpu = (nr_timers + i) % nr_cpu_ids;
+                u64 *started_at;
+
+                if (cpu == central_cpu)
+                        continue;
+
+                /* kick iff the current one exhausted its slice */
+                started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+                if (started_at && *started_at &&
+                    vtime_before(now, *started_at + slice_ns))
+                        continue;
+
+                /* and there's something pending */
+                if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) ||
+                    scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu))
+                        ;
+                else if (nr_to_kick)
+                        nr_to_kick--;
+                else
+                        continue;
+
+                scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
+        }
+
+        bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
+        __sync_fetch_and_add(&nr_timers, 1);
+        return 0;
+}
+
 int BPF_STRUCT_OPS_SLEEPABLE(central_init)
 {
-        return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+        u32 key = 0;
+        struct bpf_timer *timer;
+        int ret;
+
+        ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+        if (ret)
+                return ret;
+
+        timer = bpf_map_lookup_elem(&central_timer, &key);
+        if (!timer)
+                return -ESRCH;
+
+        if (bpf_get_smp_processor_id() != central_cpu) {
+                scx_bpf_error("init from non-central CPU");
+                return -EINVAL;
+        }
+
+        bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC);
+        bpf_timer_set_callback(timer, central_timerfn);
+
+        ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
+        /*
+         * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a
+         * kernel which doesn't have it, bpf_timer_start() will return -EINVAL.
+         * Retry without the PIN. This would be the perfect use case for
+         * bpf_core_enum_value_exists() but the enum type doesn't have a name
+         * and can't be used with bpf_core_enum_value_exists(). Oh well...
+         */
+        if (ret == -EINVAL) {
+                timer_pinned = false;
+                ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
+        }
+        if (ret)
+                scx_bpf_error("bpf_timer_start failed (%d)", ret);
+        return ret;
 }
 
 void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei)
@@ -209,6 +354,8 @@ SCX_OPS_DEFINE(central_ops,
                .select_cpu             = (void *)central_select_cpu,
                .enqueue                = (void *)central_enqueue,
                .dispatch               = (void *)central_dispatch,
+               .running                = (void *)central_running,
+               .stopping               = (void *)central_stopping,
                .init                   = (void *)central_init,
                .exit                   = (void *)central_exit,
                .name                   = "central");
