Skip to content

Commit 4ae8d9a

Browse files
author
Peter Zijlstra
committed
sched/deadline: Fix dl_server getting stuck
John found it was easy to hit lockup warnings when running locktorture on a 2 CPU VM, which he bisected down to: commit cccb45d ("sched/deadline: Less agressive dl_server handling"). While debugging it seems there is a chance where we end up with the dl_server dequeued, with dl_se->dl_server_active. This causes dl_server_start() to return without enqueueing the dl_server, thus it fails to run when RT tasks starve the cpu. When this happens, dl_server_timer() catches the '!dl_se->server_has_tasks(dl_se)' case, which then calls replenish_dl_entity() and dl_server_stopped() and finally return HRTIMER_NO_RESTART. This ends in no new timer and also no enqueue, leaving the dl_server 'dead', allowing starvation. What should have happened is for the bandwidth timer to start the zero-laxity timer, which in turn would enqueue the dl_server and cause dl_se->server_pick_task() to be called -- which will stop the dl_server if no fair tasks are observed for a whole period. IOW, it is totally irrelevant if there are fair tasks at the moment of bandwidth refresh. This removes all dl_se->server_has_tasks() users, so remove the whole thing. Fixes: cccb45d ("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Signed-off-by: Peter Zijlstra (Intel) <[email protected]> Tested-by: John Stultz <[email protected]>
1 parent f83ec76 commit 4ae8d9a

File tree

4 files changed

+2
-22
lines changed

4 files changed

+2
-22
lines changed

include/linux/sched.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -733,7 +733,6 @@ struct sched_dl_entity {
733733
* runnable task.
734734
*/
735735
struct rq *rq;
736-
dl_server_has_tasks_f server_has_tasks;
737736
dl_server_pick_f server_pick_task;
738737

739738
#ifdef CONFIG_RT_MUTEXES

kernel/sched/deadline.c

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
875875
*/
876876
if (dl_se->dl_defer && !dl_se->dl_defer_running &&
877877
dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
878-
if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
878+
if (!is_dl_boosted(dl_se)) {
879879

880880
/*
881881
* Set dl_se->dl_defer_armed and dl_throttled variables to
@@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
11521152
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
11531153
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
11541154

1155-
static bool dl_server_stopped(struct sched_dl_entity *dl_se);
1156-
11571155
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
11581156
{
11591157
struct rq *rq = rq_of_dl_se(dl_se);
@@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
11711169
if (!dl_se->dl_runtime)
11721170
return HRTIMER_NORESTART;
11731171

1174-
if (!dl_se->server_has_tasks(dl_se)) {
1175-
replenish_dl_entity(dl_se);
1176-
dl_server_stopped(dl_se);
1177-
return HRTIMER_NORESTART;
1178-
}
1179-
11801172
if (dl_se->dl_defer_armed) {
11811173
/*
11821174
* First check if the server could consume runtime in background.
@@ -1625,11 +1617,9 @@ static bool dl_server_stopped(struct sched_dl_entity *dl_se)
16251617
}
16261618

16271619
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
1628-
dl_server_has_tasks_f has_tasks,
16291620
dl_server_pick_f pick_task)
16301621
{
16311622
dl_se->rq = rq;
1632-
dl_se->server_has_tasks = has_tasks;
16331623
dl_se->server_pick_task = pick_task;
16341624
}
16351625

kernel/sched/fair.c

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8859,11 +8859,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
88598859
return pick_next_task_fair(rq, prev, NULL);
88608860
}
88618861

8862-
static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
8863-
{
8864-
return !!dl_se->rq->cfs.nr_queued;
8865-
}
8866-
88678862
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
88688863
{
88698864
return pick_task_fair(dl_se->rq);
@@ -8875,7 +8870,7 @@ void fair_server_init(struct rq *rq)
88758870

88768871
init_dl_entity(dl_se);
88778872

8878-
dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
8873+
dl_server_init(dl_se, rq, fair_server_pick_task);
88798874
}
88808875

88818876
/*

kernel/sched/sched.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -365,9 +365,6 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
365365
*
366366
* dl_se::rq -- runqueue we belong to.
367367
*
368-
* dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
369-
* server when it runs out of tasks to run.
370-
*
371368
* dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
372369
* returns NULL.
373370
*
@@ -383,7 +380,6 @@ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
383380
extern void dl_server_start(struct sched_dl_entity *dl_se);
384381
extern void dl_server_stop(struct sched_dl_entity *dl_se);
385382
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
386-
dl_server_has_tasks_f has_tasks,
387383
dl_server_pick_f pick_task);
388384
extern void sched_init_dl_servers(void);
389385

0 commit comments

Comments
 (0)