Commit 6d337ea

Author: Peter Zijlstra (committed)

sched: Fix migrate_disable() vs set_cpus_allowed_ptr()
Concurrent migrate_disable() and set_cpus_allowed_ptr() have interesting
features. We rely on set_cpus_allowed_ptr() to not return until the task
runs inside the provided mask. This expectation is exported to userspace.

This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

        preempt_disable();
        migrate_disable();
        ...
        migrate_enable();
        preempt_enable();

and:

        raw_spin_lock(&B);
        spin_unlock(&A);

This means that when migrate_enable() must restore the affinity mask, it
cannot wait for completion thereof. Luck will have it that this is exactly
the case where there is a pending set_cpus_allowed_ptr(), so let that
provide storage for the async stop machine.

Much thanks to Valentin, who used TLA+ most effectively and found lots of
'interesting' cases.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Reviewed-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent af44990 commit 6d337ea

2 files changed: +207, -30 lines
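Before the per-file diffs, a rough userspace analogue of the hand-off the commit message describes may help: the set_cpus_allowed_ptr() side owns the storage and blocks until the change takes effect, while the migrate_enable() side must not block and therefore only queues asynchronous work (the kernel uses stop_one_cpu_nowait(); the sketch below uses a detached pthread, and every name in it is invented for illustration, not kernel API).

/*
 * Hypothetical userspace sketch of the protocol, not kernel code: the
 * waiter supplies a stack-allocated "pending" and blocks on it, the
 * non-blocking side merely hands it to an asynchronous worker.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pending {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool            done;           /* stand-in for struct completion */
};

static struct pending *installed;       /* analogue of p->migration_pending */
static pthread_mutex_t install_lock = PTHREAD_MUTEX_INITIALIZER;

static void complete_pending(struct pending *p)
{
        pthread_mutex_lock(&p->lock);
        p->done = true;
        pthread_cond_broadcast(&p->cond);
        pthread_mutex_unlock(&p->lock);
}

static void wait_for_pending(struct pending *p)
{
        pthread_mutex_lock(&p->lock);
        while (!p->done)
                pthread_cond_wait(&p->cond, &p->lock);
        pthread_mutex_unlock(&p->lock);
}

/* Plays the role of the stopper callback: finish the affinity change. */
static void *stopper(void *arg)
{
        struct pending *p;

        pthread_mutex_lock(&install_lock);
        p = installed;
        installed = NULL;
        pthread_mutex_unlock(&install_lock);

        if (p)
                complete_pending(p);    /* ... after doing the migration */
        return NULL;
}

/* The migrate_enable()-like side: hand off and return, never block. */
static void enable_side(void)
{
        pthread_t t;

        pthread_create(&t, NULL, stopper, NULL);
        pthread_detach(t);
}

int main(void)
{
        struct pending my_pending = { .done = false };

        pthread_mutex_init(&my_pending.lock, NULL);
        pthread_cond_init(&my_pending.cond, NULL);

        /* set_cpus_allowed_ptr()-like side: install our pending, then wait. */
        pthread_mutex_lock(&install_lock);
        installed = &my_pending;
        pthread_mutex_unlock(&install_lock);

        enable_side();
        wait_for_pending(&my_pending);
        printf("affinity change completed\n");
        return 0;
}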

include/linux/sched.h

Lines changed: 1 addition & 0 deletions
@@ -714,6 +714,7 @@ struct task_struct {
         int                             nr_cpus_allowed;
         const cpumask_t                 *cpus_ptr;
         cpumask_t                       cpus_mask;
+        void                            *migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
         int                             migration_disabled;
 #endif

kernel/sched/core.c

Lines changed: 206 additions & 30 deletions
@@ -1732,15 +1732,26 @@ void migrate_enable(void)
 {
         struct task_struct *p = current;
 
-        if (--p->migration_disabled)
+        if (p->migration_disabled > 1) {
+                p->migration_disabled--;
                 return;
+        }
 
+        /*
+         * Ensure stop_task runs either before or after this, and that
+         * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+         */
+        preempt_disable();
+        if (p->cpus_ptr != &p->cpus_mask)
+                __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+        /*
+         * Mustn't clear migration_disabled() until cpus_ptr points back at the
+         * regular cpus_mask, otherwise things that race (eg.
+         * select_fallback_rq) get confused.
+         */
         barrier();
-
-        if (p->cpus_ptr == &p->cpus_mask)
-                return;
-
-        __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+        p->migration_disabled = 0;
+        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
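The comment added above spells out an ordering requirement: migration_disabled must not be cleared until cpus_ptr points back at cpus_mask. barrier() is the kernel's compiler barrier; below is a userspace sketch of the same spelling (the variable names are invented stand-ins, and the barrier only constrains the compiler, not the CPU).

#include <stdio.h>

/* Same spelling as the kernel's barrier(): an empty asm with a "memory"
 * clobber, which forbids the compiler from reordering memory accesses
 * across it. It is not a CPU memory barrier. */
#define barrier()       __asm__ __volatile__("" ::: "memory")

static int cpus_ptr_restored;   /* invented stand-in for the cpus_ptr state */
static int migration_disabled = 1;

int main(void)
{
        cpus_ptr_restored = 1;  /* restore the mask first ...              */
        barrier();              /* ... and only then ...                   */
        migration_disabled = 0; /* ... let is_migration_disabled() see 0   */

        printf("%d %d\n", cpus_ptr_restored, migration_disabled);
        return 0;
}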
@@ -1805,8 +1816,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 }
 
 struct migration_arg {
-        struct task_struct *task;
-        int dest_cpu;
+        struct task_struct              *task;
+        int                             dest_cpu;
+        struct set_affinity_pending     *pending;
+};
+
+struct set_affinity_pending {
+        refcount_t              refs;
+        struct completion       done;
+        struct cpu_stop_work    stop_work;
+        struct migration_arg    arg;
 };
 
 /*
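One point worth noting about the structures above, as a hedged aside: set_affinity_pending is stack-allocated by the set_cpus_allowed_ptr() caller (my_pending in affine_move_task() further below), so every other user, such as the stopper working on pending->{arg,stop_work}, pins it with refs, and the owner only returns once the count drops to zero (the wait_var_event()/wake_up_var() pair). A tiny C11 illustration of that "last put" rule, with invented names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented illustration of the refcount discipline: the object lives on
 * its owner's stack, so the owner must not return before the last
 * reference is gone. */
struct obj {
        atomic_int refs;
};

static void obj_get(struct obj *o)
{
        atomic_fetch_add_explicit(&o->refs, 1, memory_order_relaxed);
}

/* Like refcount_dec_and_test(): true only for the final put. */
static bool obj_put(struct obj *o)
{
        return atomic_fetch_sub_explicit(&o->refs, 1, memory_order_acq_rel) == 1;
}

int main(void)
{
        struct obj my_pending;

        atomic_init(&my_pending.refs, 1);       /* the owner's reference   */
        obj_get(&my_pending);                   /* e.g. the async stopper  */

        if (obj_put(&my_pending))               /* stopper's put: not last */
                printf("unexpected: stopper held the last reference\n");

        if (obj_put(&my_pending))               /* owner's put: the last   */
                printf("all users gone; my_pending may go out of scope\n");

        return 0;
}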
@@ -1838,16 +1857,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
  */
 static int migration_cpu_stop(void *data)
 {
+        struct set_affinity_pending *pending;
         struct migration_arg *arg = data;
         struct task_struct *p = arg->task;
+        int dest_cpu = arg->dest_cpu;
         struct rq *rq = this_rq();
+        bool complete = false;
         struct rq_flags rf;
 
         /*
          * The original target CPU might have gone down and we might
          * be on another CPU but it doesn't matter.
          */
-        local_irq_disable();
+        local_irq_save(rf.flags);
         /*
          * We need to explicitly wake pending tasks before running
          * __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1857,21 +1879,83 @@ static int migration_cpu_stop(void *data)
 
         raw_spin_lock(&p->pi_lock);
         rq_lock(rq, &rf);
+
+        pending = p->migration_pending;
         /*
          * If task_rq(p) != rq, it cannot be migrated here, because we're
          * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
          * we're holding p->pi_lock.
          */
         if (task_rq(p) == rq) {
+                if (is_migration_disabled(p))
+                        goto out;
+
+                if (pending) {
+                        p->migration_pending = NULL;
+                        complete = true;
+                }
+
+                /* migrate_enable() -- we must not race against SCA */
+                if (dest_cpu < 0) {
+                        /*
+                         * When this was migrate_enable() but we no longer
+                         * have a @pending, a concurrent SCA 'fixed' things
+                         * and we should be valid again. Nothing to do.
+                         */
+                        if (!pending) {
+                                WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+                                goto out;
+                        }
+
+                        dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+                }
+
                 if (task_on_rq_queued(p))
-                        rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+                        rq = __migrate_task(rq, &rf, p, dest_cpu);
                 else
-                        p->wake_cpu = arg->dest_cpu;
+                        p->wake_cpu = dest_cpu;
+
+        } else if (dest_cpu < 0) {
+                /*
+                 * This happens when we get migrated between migrate_enable()'s
+                 * preempt_enable() and scheduling the stopper task. At that
+                 * point we're a regular task again and not current anymore.
+                 *
+                 * A !PREEMPT kernel has a giant hole here, which makes it far
+                 * more likely.
+                 */
+
+                /*
+                 * When this was migrate_enable() but we no longer have an
+                 * @pending, a concurrent SCA 'fixed' things and we should be
+                 * valid again. Nothing to do.
+                 */
+                if (!pending) {
+                        WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq)));
+                        goto out;
+                }
+
+                /*
+                 * When migrate_enable() hits a rq mis-match we can't reliably
+                 * determine is_migration_disabled() and so have to chase after
+                 * it.
+                 */
+                task_rq_unlock(rq, p, &rf);
+                stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+                                    &pending->arg, &pending->stop_work);
+                return 0;
         }
-        rq_unlock(rq, &rf);
-        raw_spin_unlock(&p->pi_lock);
+out:
+        task_rq_unlock(rq, p, &rf);
+
+        if (complete)
+                complete_all(&pending->done);
+
+        /* For pending->{arg,stop_work} */
+        pending = arg->pending;
+        if (pending && refcount_dec_and_test(&pending->refs))
+                wake_up_var(&pending->refs);
 
-        local_irq_enable();
         return 0;
 }
 
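In the dest_cpu < 0 branch above (the migrate_enable() case) there is no specific target CPU, so the stopper picks any CPU still permitted by p->cpus_mask with cpumask_any_distribute(), which spreads successive picks across the mask rather than always choosing the lowest set bit. Below is a rough standalone illustration of that idea over a 64-bit mask; pick_distributed() is an invented helper, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

/* Pick a set bit in @mask, starting just past the previous pick kept in
 * *rotor, so repeated calls rotate through the allowed CPUs. */
static int pick_distributed(uint64_t mask, int *rotor)
{
        if (!mask)
                return -1;

        for (int i = 1; i <= 64; i++) {
                int cpu = (*rotor + i) % 64;

                if (mask & (UINT64_C(1) << cpu)) {
                        *rotor = cpu;
                        return cpu;
                }
        }
        return -1;      /* unreachable while mask != 0 */
}

int main(void)
{
        uint64_t cpus_mask = 0x0f;      /* pretend CPUs 0-3 are allowed */
        int rotor = -1;

        for (int i = 0; i < 6; i++)
                printf("pick %d -> CPU %d\n", i, pick_distributed(cpus_mask, &rotor));
        return 0;
}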
@@ -1940,6 +2024,112 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         __do_set_cpus_allowed(p, new_mask, 0);
 }
 
+/*
+ * This function is wildly self concurrent, consider at least 3 times.
+ */
+static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
+                            int dest_cpu, unsigned int flags)
+{
+        struct set_affinity_pending my_pending = { }, *pending = NULL;
+        struct migration_arg arg = {
+                .task = p,
+                .dest_cpu = dest_cpu,
+        };
+        bool complete = false;
+
+        /* Can the task run on the task's current CPU? If so, we're done */
+        if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+                pending = p->migration_pending;
+                if (pending) {
+                        refcount_inc(&pending->refs);
+                        p->migration_pending = NULL;
+                        complete = true;
+                }
+                task_rq_unlock(rq, p, rf);
+
+                if (complete)
+                        goto do_complete;
+
+                return 0;
+        }
+
+        if (!(flags & SCA_MIGRATE_ENABLE)) {
+                /* serialized by p->pi_lock */
+                if (!p->migration_pending) {
+                        refcount_set(&my_pending.refs, 1);
+                        init_completion(&my_pending.done);
+                        p->migration_pending = &my_pending;
+                } else {
+                        pending = p->migration_pending;
+                        refcount_inc(&pending->refs);
+                }
+        }
+        pending = p->migration_pending;
+        /*
+         * - !MIGRATE_ENABLE:
+         *   we'll have installed a pending if there wasn't one already.
+         *
+         * - MIGRATE_ENABLE:
+         *   we're here because the current CPU isn't matching anymore,
+         *   the only way that can happen is because of a concurrent
+         *   set_cpus_allowed_ptr() call, which should then still be
+         *   pending completion.
+         *
+         * Either way, we really should have a @pending here.
+         */
+        if (WARN_ON_ONCE(!pending)) {
+                task_rq_unlock(rq, p, rf);
+                return -EINVAL;
+        }
+
+        if (flags & SCA_MIGRATE_ENABLE) {
+
+                refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+                task_rq_unlock(rq, p, rf);
+
+                pending->arg = (struct migration_arg) {
+                        .task = p,
+                        .dest_cpu = -1,
+                        .pending = pending,
+                };
+
+                stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+                                    &pending->arg, &pending->stop_work);
+
+                return 0;
+        }
+
+        if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+                task_rq_unlock(rq, p, rf);
+                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+        } else {
+
+                if (!is_migration_disabled(p)) {
+                        if (task_on_rq_queued(p))
+                                rq = move_queued_task(rq, rf, p, dest_cpu);
+
+                        p->migration_pending = NULL;
+                        complete = true;
+                }
+                task_rq_unlock(rq, p, rf);
+
+do_complete:
+                if (complete)
+                        complete_all(&pending->done);
+        }
+
+        wait_for_completion(&pending->done);
+
+        if (refcount_dec_and_test(&pending->refs))
+                wake_up_var(&pending->refs);
+
+        wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+        return 0;
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -2009,23 +2199,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                             p->nr_cpus_allowed != 1);
         }
 
-        /* Can the task run on the task's current CPU? If so, we're done */
-        if (cpumask_test_cpu(task_cpu(p), new_mask))
-                goto out;
+        return affine_move_task(rq, p, &rf, dest_cpu, flags);
 
-        if (task_running(rq, p) || p->state == TASK_WAKING) {
-                struct migration_arg arg = { p, dest_cpu };
-                /* Need help from migration thread: drop lock and wait. */
-                task_rq_unlock(rq, p, &rf);
-                stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-                return 0;
-        } else if (task_on_rq_queued(p)) {
-                /*
-                 * OK, since we're going to drop the lock immediately
-                 * afterwards anyway.
-                 */
-                rq = move_queued_task(rq, &rf, p, dest_cpu);
-        }
 out:
         task_rq_unlock(rq, p, &rf);
 
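With __set_cpus_allowed_ptr() now ending in affine_move_task(), the userspace-visible contract noted in the commit message (a task changing its own affinity does not see the call return before it runs inside the new mask) is what the wait_for_completion(&pending->done) above provides. A small Linux-only check of that expectation is sketched below; it assumes CPU 0 is online and that nothing races to change the affinity again.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;
        int target = 0;                 /* assumes CPU 0 is online */

        CPU_ZERO(&set);
        CPU_SET(target, &set);

        /* Pin the calling thread; per the expectation above, by the time
         * this returns we should already be running inside the new mask. */
        if (sched_setaffinity(0, sizeof(set), &set)) {
                perror("sched_setaffinity");
                return 1;
        }

        printf("running on CPU %d (expected %d)\n", sched_getcpu(), target);
        return 0;
}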
@@ -3205,6 +3380,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
         p->wake_entry.u_flags = CSD_TYPE_TTWU;
+        p->migration_pending = NULL;
 #endif
 }
 