
Commit 5896001
sched/core: Fix migrate_swap() vs. hotplug
JIRA: https://issues.redhat.com/browse/RHEL-107437

commit 009836b
Author: Peter Zijlstra <[email protected]>
Date:   Thu Jun 5 12:00:09 2025 +0200

    sched/core: Fix migrate_swap() vs. hotplug

    On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote:

    > So, the potential race scenario is:
    >
    >   CPU0                                    CPU1
    >   // doing migrate_swap(cpu0/cpu1)
    >   stop_two_cpus()
    >                                           ...
    >                                           // doing _cpu_down()
    >                                           sched_cpu_deactivate()
    >                                             set_cpu_active(cpu, false);
    >                                             balance_push_set(cpu, true);
    >   cpu_stop_queue_two_works
    >     __cpu_stop_queue_work(stopper1,...);
    >     __cpu_stop_queue_work(stopper2,..);
    >     stop_cpus_in_progress -> true
    >     preempt_enable();
    >                                           ...
    >                                           1st balance_push
    >                                           stop_one_cpu_nowait
    >                                           cpu_stop_queue_work
    >                                           __cpu_stop_queue_work
    >                                           list_add_tail -> 1st add push_work
    >                                           wake_up_q(&wakeq); -> "wakeq is empty.
    >                                             This implies that the stopper is at
    >                                             wakeq@migrate_swap."
    >   preempt_disable
    >   wake_up_q(&wakeq);
    >     wake_up_process // wakeup migrate/0
    >       try_to_wake_up
    >         ttwu_queue
    >           ttwu_queue_cond -> meets the case below
    >             if (cpu == smp_processor_id())
    >               return false;
    >           ttwu_do_activate
    >           // migrate/0 wakeup done
    >     wake_up_process // wakeup migrate/1
    >       try_to_wake_up
    >         ttwu_queue
    >           ttwu_queue_cond
    >             ttwu_queue_wakelist
    >               __ttwu_queue_wakelist
    >                 __smp_call_single_queue
    >   preempt_enable();
    >
    >                                           2nd balance_push
    >                                           stop_one_cpu_nowait
    >                                           cpu_stop_queue_work
    >                                           __cpu_stop_queue_work
    >                                           list_add_tail -> 2nd add of push_work,
    >                                             so the double list add is detected
    >                                           ...
    >                                           cpu1 gets the IPI, runs
    >                                           sched_ttwu_pending, wakes migrate/1

    So this balance_push() is part of schedule(), and schedule() is
    supposed to switch to the stopper task, but because of this race
    condition the stopper task is stuck in WAKING state and not actually
    visible to be picked.

    Therefore CPU1 can do another schedule() and end up doing another
    balance_push() even though the last one hasn't been done yet.

    This is a confluence of failures, where both wake_q and the ttwu
    wakelist can cause crucial wakeups to be delayed, resulting in the
    malfunction of balance_push().

    Since there is only a single stopper thread to be woken, the wake_q
    doesn't really add anything here, and can be removed in favour of
    direct wakeups of the stopper thread.

    Then add a clause to ttwu_queue_cond() to ensure the stopper threads
    are never queued / delayed.

    Of all 3 moving parts, the last addition was the balance_push()
    machinery, so pick that as the point the bug was introduced.

    Fixes: 2558aac ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug")
    Reported-by: Kuyo Chang <[email protected]>
    Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
    Tested-by: Kuyo Chang <[email protected]>
    Link: https://lkml.kernel.org/r/[email protected]

Signed-off-by: Phil Auld <[email protected]>
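To see why balance_push()'s wakeup can be absorbed, note the first-add-wins property of wake_q: wake_q_add() claims the task with an atomic compare-and-swap on its wake_q pointer, so a stopper already sitting on migrate_swap()'s wake_q cannot be re-added by balance_push(), and the second path's wakeup is silently deferred to whoever queued first. Below is a minimal user-space model of just that property; model_wake_q_add and on_wake_q are invented names for illustration, not kernel code.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct task {
        atomic_bool on_wake_q;  /* stands in for task->wake_q.next */
        const char *comm;
    };

    /* Returns true iff this caller took ownership of the pending wakeup. */
    static bool model_wake_q_add(struct task *t)
    {
        bool expected = false;
        return atomic_compare_exchange_strong(&t->on_wake_q, &expected, true);
    }

    int main(void)
    {
        struct task stopper = { false, "migrate/1" };

        /* stop_two_cpus() queues the stopper on its wake_q first... */
        printf("migrate_swap add owns wakeup: %d\n", model_wake_q_add(&stopper));
        /* ...so balance_push()'s add is a no-op and its wakeup is delayed. */
        printf("balance_push add owns wakeup: %d\n", model_wake_q_add(&stopper));
        return 0;
    }

The first call prints 1 and the second prints 0: exactly the "wakeq is empty" observation in the trace above, where the stopper's wakeup is stranded on migrate_swap()'s wake_q.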
1 parent d2d0070 commit 5896001

2 files changed: +15 −10 lines

kernel/sched/core.c

Lines changed: 5 additions & 0 deletions
@@ -3830,6 +3830,11 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
 
 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
+#ifdef CONFIG_SMP
+	if (p->sched_class == &stop_sched_class)
+		return false;
+#endif
+
 	/*
 	 * Do not complicate things with the async wake_list while the CPU is
 	 * in hotplug state.

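For context, ttwu_queue_cond() decides whether a remote wakeup may be handed to the target CPU's wake list (delivered later via IPI) or must activate the task synchronously; the hunk above adds an early exit so stop-class tasks always take the synchronous path. The sketch below is a simplified user-space model of that decision. The real function checks more conditions (for example, the hotplug state mentioned in the context lines), and all names here are invented:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified model of the wakeup-queueing decision (hypothetical names). */
    enum class_id { CLASS_FAIR, CLASS_RT, CLASS_STOP };

    struct task { enum class_id sched_class; };

    /*
     * true  -> defer the wakeup to the target CPU's wake list (IPI-driven)
     * false -> activate the task synchronously under the target rq lock
     */
    static bool model_ttwu_queue_cond(const struct task *p, int cpu,
                                      int this_cpu, bool share_cache)
    {
        /* The fix: a stopper wakeup must never sit behind an IPI. */
        if (p->sched_class == CLASS_STOP)
            return false;

        /* Local wakeups never need the remote wake list. */
        if (cpu == this_cpu)
            return false;

        /* Cross-cache wakeups are where the wake list pays off. */
        return !share_cache;
    }

    int main(void)
    {
        struct task stopper = { CLASS_STOP }, fair = { CLASS_FAIR };

        printf("stopper, cross-cache: %d\n",
               model_ttwu_queue_cond(&stopper, 1, 0, false)); /* 0: direct */
        printf("fair,    cross-cache: %d\n",
               model_ttwu_queue_cond(&fair, 1, 0, false));    /* 1: wake list */
        return 0;
    }

In the failing trace, migrate/1's wakeup took the wake-list branch and waited on an IPI; with the new early exit that branch is unreachable for stopper threads.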
kernel/stop_machine.c

Lines changed: 10 additions & 10 deletions
@@ -82,31 +82,29 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
 }
 
 static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
-					struct cpu_stop_work *work,
-					struct wake_q_head *wakeq)
+					struct cpu_stop_work *work)
 {
 	list_add_tail(&work->list, &stopper->works);
-	wake_q_add(wakeq, stopper->thread);
 }
 
 /* queue @work to @stopper. if offline, @work is completed immediately */
 static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-	DEFINE_WAKE_Q(wakeq);
 	unsigned long flags;
 	bool enabled;
 
 	preempt_disable();
 	raw_spin_lock_irqsave(&stopper->lock, flags);
 	enabled = stopper->enabled;
 	if (enabled)
-		__cpu_stop_queue_work(stopper, work, &wakeq);
+		__cpu_stop_queue_work(stopper, work);
 	else if (work->done)
 		cpu_stop_signal_done(work->done);
 	raw_spin_unlock_irqrestore(&stopper->lock, flags);
 
-	wake_up_q(&wakeq);
+	if (enabled)
+		wake_up_process(stopper->thread);
 	preempt_enable();
 
 	return enabled;
@@ -263,7 +261,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 {
 	struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
 	struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
-	DEFINE_WAKE_Q(wakeq);
 	int err;
 
 retry:
@@ -299,8 +296,8 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 	}
 
 	err = 0;
-	__cpu_stop_queue_work(stopper1, work1, &wakeq);
-	__cpu_stop_queue_work(stopper2, work2, &wakeq);
+	__cpu_stop_queue_work(stopper1, work1);
+	__cpu_stop_queue_work(stopper2, work2);
 
 unlock:
 	raw_spin_unlock(&stopper2->lock);
@@ -315,7 +312,10 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 		goto retry;
 	}
 
-	wake_up_q(&wakeq);
+	if (!err) {
+		wake_up_process(stopper1->thread);
+		wake_up_process(stopper2->thread);
+	}
 	preempt_enable();
 
 	return err;
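Net effect of the stop_machine.c side: the shared wake_q is gone, and each successfully queued work is followed by a direct wake_up_process() of the corresponding stopper thread, issued only when the work was actually queued (enabled in the single-work path, !err in the two-works path). A compact user-space sketch of the resulting protocol, with invented names:

    #include <stdio.h>

    /* Sketch of the post-patch wakeup protocol (hypothetical names). */
    struct stopper { const char *comm; };

    /* Stand-in for wake_up_process(): an unconditional, unshared wakeup. */
    static void model_wake(struct stopper *s)
    {
        printf("wake %s\n", s->comm);
    }

    /* Mirrors the tail of cpu_stop_queue_two_works(): wake both stoppers
     * directly, and only when both works were queued (err == 0). */
    static int model_queue_two(struct stopper *s1, struct stopper *s2, int err)
    {
        if (!err) {
            model_wake(s1);
            model_wake(s2);
        }
        return err;
    }

    int main(void)
    {
        struct stopper m0 = { "migrate/0" }, m1 = { "migrate/1" };
        return model_queue_two(&m0, &m1, 0);
    }

Because wake_up_process() shares no state between callers, the migrate_swap() and balance_push() paths can no longer absorb each other's wakeups the way the wake_q could.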
