
Commit 1cf12e0

KAGA-KOKO (Thomas Gleixner) authored and Peter Zijlstra committed
sched/hotplug: Consolidate task migration on CPU unplug
With the new mechanism which kicks tasks off the outgoing CPU at the end of schedule(), the situation on an outgoing CPU right before the stopper thread brings it down completely is:

 - All user tasks and all unbound kernel threads have either been migrated away or are not running, and the next wakeup will move them to an online CPU.

 - All per-CPU kernel threads, except the CPU hotplug thread and the stopper thread, have either been unbound or parked by the responsible CPU hotplug callback.

That means that at the last step before the stopper thread is invoked, the CPU hotplug thread is the last legitimate running task on the outgoing CPU.

Add a final wait step right before the stopper thread is kicked which ensures that any still running tasks on the way to park or on the way to kick themselves off the CPU are either sleeping or gone.

This allows removing the migrate_tasks() crutch in sched_cpu_dying(). If sched_cpu_dying() detects that there is still another running task aside from the stopper thread, it will explode with the appropriate fireworks.

Signed-off-by: Thomas Gleixner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Reviewed-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent 0624973 commit 1cf12e0
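Condensed from the hunks below (no new code beyond what the diff adds), the shape of the change is a single new hotplug state whose teardown callback only waits for the outgoing CPU's runqueue to drain; that wait is what lets the migrate_tasks() loop in sched_cpu_dying() go away:

	/* New AP state in cpuhp_hp_states[], ordered directly above CPUHP_TEARDOWN_CPU: */
	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
		.name			= "sched:waitempty",
		.startup.single		= NULL,
		.teardown.single	= sched_cpu_wait_empty,
	},

/* Its teardown callback only has to wait until the CPU is empty: */
int sched_cpu_wait_empty(unsigned int cpu)
{
	balance_hotplug_wait();
	return 0;
}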

4 files changed, 46 insertions(+), 120 deletions(-)

include/linux/cpuhotplug.h

Lines changed: 1 addition & 0 deletions

@@ -152,6 +152,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_SCHED_WAIT_EMPTY,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,

include/linux/sched/hotplug.h

Lines changed: 2 additions & 0 deletions

@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty	NULL
 # define sched_cpu_dying	NULL
 #endif
 

kernel/cpu.c

Lines changed: 8 additions & 1 deletion

@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.name			= "ap:online",
 	},
 	/*
-	 * Handled on controll processor until the plugged processor manages
+	 * Handled on control processor until the plugged processor manages
 	 * this itself.
 	 */
 	[CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
 		.teardown.single	= takedown_cpu,
 		.cant_stop		= true,
 	},
+
+	[CPUHP_AP_SCHED_WAIT_EMPTY] = {
+		.name			= "sched:waitempty",
+		.startup.single		= NULL,
+		.teardown.single	= sched_cpu_wait_empty,
+	},
+
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot/threads:online",

kernel/sched/core.c

Lines changed: 35 additions & 119 deletions
@@ -6741,120 +6741,6 @@ void idle_task_exit(void)
 	/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
 }
 
-/*
- * Since this CPU is going 'away' for a while, fold any nr_active delta
- * we might have. Assumes we're called after migrate_tasks() so that the
- * nr_active count is stable. We need to take the teardown thread which
- * is calling this into account, so we hand in adjust = 1 to the load
- * calculation.
- *
- * Also see the comment "Global load-average calculations".
- */
-static void calc_load_migrate(struct rq *rq)
-{
-	long delta = calc_load_fold_active(rq, 1);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-}
-
-static struct task_struct *__pick_migrate_task(struct rq *rq)
-{
-	const struct sched_class *class;
-	struct task_struct *next;
-
-	for_each_class(class) {
-		next = class->pick_next_task(rq);
-		if (next) {
-			next->sched_class->put_prev_task(rq, next);
-			return next;
-		}
-	}
-
-	/* The idle class should always have a runnable task */
-	BUG();
-}
-
-/*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
- *
- * Called with rq->lock held even though we'er in stop_machine() and
- * there's no concurrency possible, we hold the required locks anyway
- * because of lock validation efforts.
- */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
-{
-	struct rq *rq = dead_rq;
-	struct task_struct *next, *stop = rq->stop;
-	struct rq_flags orf = *rf;
-	int dest_cpu;
-
-	/*
-	 * Fudge the rq selection such that the below task selection loop
-	 * doesn't get stuck on the currently eligible stop task.
-	 *
-	 * We're currently inside stop_machine() and the rq is either stuck
-	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
-	 * either way we should never end up calling schedule() until we're
-	 * done here.
-	 */
-	rq->stop = NULL;
-
-	/*
-	 * put_prev_task() and pick_next_task() sched
-	 * class method both need to have an up-to-date
-	 * value of rq->clock[_task]
-	 */
-	update_rq_clock(rq);
-
-	for (;;) {
-		/*
-		 * There's this thread running, bail when that's the only
-		 * remaining thread:
-		 */
-		if (rq->nr_running == 1)
-			break;
-
-		next = __pick_migrate_task(rq);
-
-		/*
-		 * Rules for changing task_struct::cpus_mask are holding
-		 * both pi_lock and rq->lock, such that holding either
-		 * stabilizes the mask.
-		 *
-		 * Drop rq->lock is not quite as disastrous as it usually is
-		 * because !cpu_active at this point, which means load-balance
-		 * will not interfere. Also, stop-machine.
-		 */
-		rq_unlock(rq, rf);
-		raw_spin_lock(&next->pi_lock);
-		rq_relock(rq, rf);
-
-		/*
-		 * Since we're inside stop-machine, _nothing_ should have
-		 * changed the task, WARN if weird stuff happened, because in
-		 * that case the above rq->lock drop is a fail too.
-		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
-			raw_spin_unlock(&next->pi_lock);
-			continue;
-		}
-
-		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-		rq = __migrate_task(rq, rf, next, dest_cpu);
-		if (rq != dead_rq) {
-			rq_unlock(rq, rf);
-			rq = dead_rq;
-			*rf = orf;
-			rq_relock(rq, rf);
-		}
-		raw_spin_unlock(&next->pi_lock);
-	}
-
-	rq->stop = stop;
-}
-
 static int __balance_push_cpu_stop(void *arg)
 {
 	struct task_struct *p = arg;
@@ -7123,10 +7009,6 @@ int sched_cpu_deactivate(unsigned int cpu)
 		return ret;
 	}
 	sched_domains_numa_masks_clear(cpu);
-
-	/* Wait for all non per CPU kernel threads to vanish. */
-	balance_hotplug_wait();
-
 	return 0;
 }
 
@@ -7146,6 +7028,41 @@ int sched_cpu_starting(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Invoked immediately before the stopper thread is invoked to bring the
+ * CPU down completely. At this point all per CPU kthreads except the
+ * hotplug thread (current) and the stopper thread (inactive) have been
+ * either parked or have been unbound from the outgoing CPU. Ensure that
+ * any of those which might be on the way out are gone.
+ *
+ * If after this point a bound task is being woken on this CPU then the
+ * responsible hotplug callback has failed to do it's job.
+ * sched_cpu_dying() will catch it with the appropriate fireworks.
+ */
+int sched_cpu_wait_empty(unsigned int cpu)
+{
+	balance_hotplug_wait();
+	return 0;
+}
+
+/*
+ * Since this CPU is going 'away' for a while, fold any nr_active delta we
+ * might have. Called from the CPU stopper task after ensuring that the
+ * stopper is the last running task on the CPU, so nr_active count is
+ * stable. We need to take the teardown thread which is calling this into
+ * account, so we hand in adjust = 1 to the load calculation.
+ *
+ * Also see the comment "Global load-average calculations".
+ */
+static void calc_load_migrate(struct rq *rq)
+{
+	long delta = calc_load_fold_active(rq, 1);
+
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7159,7 +7076,6 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 
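For context only: balance_hotplug_wait(), which the new sched_cpu_wait_empty() calls, is not part of this diff; it comes from the earlier balance_push change that this commit message refers to ("the new mechanism which kicks tasks off the outgoing CPU at the end of schedule()"). A rough sketch of the waiting side, under the assumption that rq->hotplug_wait is an rcuwait woken from the balance_push() path once only the hotplug thread remains runnable; the real code in kernel/sched/core.c may differ in detail:

/* Sketch only -- the actual implementation lives elsewhere in core.c. */
static void balance_hotplug_wait(void)
{
	struct rq *rq = this_rq();

	/* Sleep until the outgoing CPU's hotplug thread is the only runnable task. */
	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
			   TASK_UNINTERRUPTIBLE);
}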
