Commit c40f7d7

torvalds authored and Ingo Molnar committed
sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f65
Zhipeng Xie, Xie XiuQi and Sargun Dhillon reported lockups in the
scheduler under high loads, starting at around the v4.18 time frame,
and Zhipeng Xie tracked it down to bugs in the rq->leaf_cfs_rq_list
manipulation.

Do a (manual) revert of:

  a9e7f65 ("sched/fair: Fix O(nr_cgroups) in load balance path")

It turns out that the list_del_leaf_cfs_rq() introduced by this commit
has a surprising property that was not considered in followup commits
such as:

  9c2791f ("sched/fair: Fix hierarchical order in rq->leaf_cfs_rq_list")

As Vincent Guittot explains:

 "I think that there is a bigger problem with commit a9e7f65 and
  cfs_rq throttling:

  Let's take the example of the following topology TG2 --> TG1 --> root:

  1) The 1st time a task is enqueued, we will add TG2 cfs_rq then TG1
     cfs_rq to leaf_cfs_rq_list and we are sure to do the whole branch in
     one path because it has never been used and can't be throttled, so
     tmp_alone_branch will point to leaf_cfs_rq_list at the end.

  2) Then TG1 is throttled.

  3) And we add TG3 as a new child of TG1.

  4) The 1st enqueue of a task on TG3 will add TG3 cfs_rq just before TG1
     cfs_rq and tmp_alone_branch will stay on rq->leaf_cfs_rq_list.

  With commit a9e7f65, we can delete a cfs_rq from rq->leaf_cfs_rq_list.
  So if the load of TG1 cfs_rq becomes NULL before step 2) above, TG1
  cfs_rq is removed from the list. Then at step 4), TG3 cfs_rq is added
  at the beginning of rq->leaf_cfs_rq_list but tmp_alone_branch still
  points to TG3 cfs_rq because its throttled parent can't be enqueued
  when the lock is released. tmp_alone_branch doesn't point to
  rq->leaf_cfs_rq_list whereas it should.

  So if TG3 cfs_rq is removed or destroyed before tmp_alone_branch
  points to another TG cfs_rq, the next TG cfs_rq that will be added
  will be linked outside rq->leaf_cfs_rq_list - which is bad.

  In addition, we can break the ordering of the cfs_rq in
  rq->leaf_cfs_rq_list, but this ordering is used to update and
  propagate the update from leaf down to root."

Instead of trying to work through all these cases and trying to
reproduce the very high loads that produced the lockup to begin with,
simplify the code temporarily by reverting a9e7f65 - which change was
clearly not thought through completely.

This (hopefully) gives us a kernel that doesn't lock up so people can
continue to enjoy their holidays without worrying about regressions. ;-)

[ mingo: Wrote changelog, fixed weird spelling in code comment while at it. ]

Analyzed-by: Xie XiuQi <[email protected]>
Analyzed-by: Vincent Guittot <[email protected]>
Reported-by: Zhipeng Xie <[email protected]>
Reported-by: Sargun Dhillon <[email protected]>
Reported-by: Xie XiuQi <[email protected]>
Tested-by: Zhipeng Xie <[email protected]>
Tested-by: Sargun Dhillon <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Acked-by: Vincent Guittot <[email protected]>
Cc: <[email protected]> # v4.13+
Cc: Bin Li <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Fixes: a9e7f65 ("sched/fair: Fix O(nr_cgroups) in load balance path")
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
1 parent 6d101ba commit c40f7d7
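
To make Vincent's four-step scenario above easier to follow, here is a minimal user-space sketch of the list bookkeeping it describes. Everything in it - struct leaf, add_leaf(), del_leaf(), insert_after() - is invented for this illustration; it only approximates the roles of rq->leaf_cfs_rq_list, rq->tmp_alone_branch, list_add_leaf_cfs_rq() and the list_del_leaf_cfs_rq() call added by a9e7f65, and is not the kernel implementation.

/*
 * Minimal user-space sketch of the scenario quoted above (not kernel code).
 * "struct leaf" stands in for a cfs_rq's leaf_cfs_rq_list entry, "head" for
 * rq->leaf_cfs_rq_list and "tmp_alone_branch" for rq->tmp_alone_branch.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct leaf {
    struct leaf *prev, *next;   /* intrusive circular list linkage */
    struct leaf *parent;        /* parent group's entry, NULL for the root */
    const char *name;
    bool on_list;
};

static struct leaf head = { &head, &head, NULL, "head", true };
static struct leaf *tmp_alone_branch = &head;   /* where an open branch continues */

static void insert_after(struct leaf *n, struct leaf *pos)
{
    n->prev = pos;
    n->next = pos->next;
    pos->next->prev = n;
    pos->next = n;
    n->on_list = true;
}

/* Analogue of the list_del_leaf_cfs_rq() call that a9e7f65 added. */
static void del_leaf(struct leaf *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    n->on_list = false;
}

/* Rough analogue of list_add_leaf_cfs_rq() after 9c2791f. */
static void add_leaf(struct leaf *n)
{
    if (n->on_list)
        return;
    if (n->parent && n->parent->on_list) {
        insert_after(n, n->parent->prev);   /* child goes just before its parent */
        tmp_alone_branch = &head;           /* branch is connected: reset */
    } else if (!n->parent) {
        insert_after(n, head.prev);         /* root goes at the tail */
        tmp_alone_branch = &head;
    } else {
        insert_after(n, tmp_alone_branch);  /* open branch: parent should follow */
        tmp_alone_branch = n;
    }
}

int main(void)
{
    struct leaf root = { .parent = NULL,  .name = "root" };
    struct leaf tg1  = { .parent = &root, .name = "TG1" };
    struct leaf tg2  = { .parent = &tg1,  .name = "TG2" };
    struct leaf tg3  = { .parent = &tg1,  .name = "TG3" };

    add_leaf(&root);                    /* root cfs_rq is already on the list */

    /* 1) first enqueue walks bottom-up: TG2, then TG1 - one complete branch */
    add_leaf(&tg2);
    add_leaf(&tg1);
    assert(tmp_alone_branch == &head);  /* invariant holds */

    del_leaf(&tg1);                     /* a9e7f65: TG1's load decayed to zero */

    /*
     * 2) TG1 is throttled, 3) TG3 is created under it, 4) a first task is
     * enqueued on TG3: TG1 is off the list, so TG3 opens a new branch, and
     * the throttled TG1 is never enqueued to close it.
     */
    add_leaf(&tg3);

    printf("tmp_alone_branch -> %s (should be head)\n", tmp_alone_branch->name);
    assert(tmp_alone_branch != &head);  /* the broken state described above */
    return 0;
}

The program ends with tmp_alone_branch pointing at the TG3 node instead of back at the list head, which is the state from which a later addition can be linked outside rq->leaf_cfs_rq_list and the child-before-parent ordering relied on by update_blocked_averages() can be broken.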

File tree

1 file changed, +9 -34 lines changed

kernel/sched/fair.c

Lines changed: 9 additions & 34 deletions
@@ -352,10 +352,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	}
 }
 
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
-	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
-				 leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue: */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -447,8 +446,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
-		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq)	\
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -7647,27 +7646,10 @@ static inline bool others_have_blocked(struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
-	if (cfs_rq->load.weight)
-		return false;
-
-	if (cfs_rq->avg.load_sum)
-		return false;
-
-	if (cfs_rq->avg.util_sum)
-		return false;
-
-	if (cfs_rq->avg.runnable_load_sum)
-		return false;
-
-	return true;
-}
-
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq, *pos;
+	struct cfs_rq *cfs_rq;
 	const struct sched_class *curr_class;
 	struct rq_flags rf;
 	bool done = true;
@@ -7679,7 +7661,7 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
 		struct sched_entity *se;
 
 		/* throttled entities do not contribute to load */
@@ -7694,13 +7676,6 @@ static void update_blocked_averages(int cpu)
 		if (se && !skip_blocked_update(se))
 			update_load_avg(cfs_rq_of(se), se, 0);
 
-		/*
-		 * There can be a lot of idle CPU cgroups. Don't let fully
-		 * decayed cfs_rqs linger on the list.
-		 */
-		if (cfs_rq_is_decayed(cfs_rq))
-			list_del_leaf_cfs_rq(cfs_rq);
-
 		/* Don't need periodic decay once load/util_avg are null */
 		if (cfs_rq_has_blocked(cfs_rq))
 			done = false;
@@ -10570,10 +10545,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq, *pos;
+	struct cfs_rq *cfs_rq;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }
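
A note on the macro change in the first two hunks: for_each_leaf_cfs_rq_safe() was built on list_for_each_entry_safe(), which is only needed when the loop body may delete the entry it is currently visiting - exactly what the reverted list_del_leaf_cfs_rq() call in update_blocked_averages() did. With that deletion gone, the plain RCU walk restored here is sufficient. A schematic contrast, with placeholder loop bodies (fully_decayed(), update_one_cfs_rq()) rather than the actual kernel code:

struct cfs_rq *cfs_rq, *pos;

/* _safe variant: caches the next entry, so the current one may be deleted. */
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) {
	if (fully_decayed(cfs_rq))		/* placeholder predicate */
		list_del_leaf_cfs_rq(cfs_rq);	/* deleting mid-walk is safe here */
}

/* RCU variant: read-side walk, nothing is removed from inside the loop. */
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
	update_one_cfs_rq(cfs_rq);		/* placeholder per-entry work */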
