
Commit a7c8155

Author: Peter Zijlstra
sched: Fix migrate_disable() vs rt/dl balancing
In order to minimize the interference of migrate_disable() on lower priority tasks, which can be deprived of runtime due to being stuck below a higher priority task, teach the RT/DL balancers to push away these higher priority tasks when a lower priority task gets selected to run on a freshly demoted CPU (pull).

This adds migration interference to the higher priority task, but restores bandwidth to the system that would otherwise be irrevocably lost. Without this it would be possible to have all tasks on the system stuck on a single CPU, each task preempted in a migrate_disable() section with a single high priority task running.

This way we can still approximate running the M highest priority tasks on the system.

Migrating the top task away is (of course) still subject to migrate_disable() too, which means the lower priority task suffers interference equivalent to the worst-case migrate_disable() section.

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Daniel Bristot de Oliveira <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
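For orientation, the mechanism can be condensed from the pull_rt_task() / pull_dl_task() and push_cpu_stop() hunks below into a simplified sketch (not the verbatim kernel code):

	/*
	 * Pull side: if the best candidate @p on @src_rq is pinned by
	 * migrate_disable(), do not migrate it; instead ask the source CPU
	 * to push its currently running, higher priority task somewhere else.
	 */
	if (is_migration_disabled(p)) {
		push_task = get_push_task(src_rq);	/* ref on src_rq->curr, marks rq busy */
	} else {
		deactivate_task(src_rq, p, 0);		/* the ordinary pull */
		set_task_cpu(p, this_cpu);
		activate_task(this_rq, p, 0);
		resched = true;
	}
	/* ... after dropping the double rq lock ... */
	if (push_task) {
		raw_spin_unlock(&this_rq->lock);
		stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
				    push_task, &src_rq->push_work);
		raw_spin_lock(&this_rq->lock);
	}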
1 parent ded467d commit a7c8155

File tree

6 files changed: +186, -48 lines changed


include/linux/preempt.h

Lines changed: 22 additions & 18 deletions
@@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 
 /*
- * Migrate-Disable and why it is (strongly) undesired.
- *
- * The premise of the Real-Time schedulers we have on Linux
- * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks
- * concurrently, provided there are sufficient runnable tasks, also known as
- * work-conserving. For instance SCHED_DEADLINE tries to schedule the M
- * earliest deadline threads, and SCHED_FIFO the M highest priority threads.
- *
- * The correctness of various scheduling models depends on this, but is it
- * broken by migrate_disable() that doesn't imply preempt_disable(). Where
- * preempt_disable() implies an immediate priority ceiling, preemptible
- * migrate_disable() allows nesting.
- *
- * The worst case is that all tasks preempt one another in a migrate_disable()
- * region and stack on a single CPU. This then reduces the available bandwidth
- * to a single CPU. And since Real-Time schedulability theory considers the
- * Worst-Case only, all Real-Time analysis shall revert to single-CPU
- * (instantly solving the SMP analysis problem).
+ * Migrate-Disable and why it is undesired.
+ *
+ * When a preempted task becomes elegible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority tasks; which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing it's available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
  *
  *
  * The reason we have it anyway.
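To make the comment's trade-off concrete, here is a small illustration of the semantics it describes (not part of this commit; do_percpu_work() is a hypothetical placeholder): a migrate_disable() section remains preemptible, so a higher priority task gets to run immediately, but the owner of the section cannot be migrated off the CPU until migrate_enable():

	static void low_prio_work(void)
	{
		migrate_disable();	/* pinned to this CPU, but still preemptible */
		do_percpu_work();	/* an RT task may preempt us right here, yet
					 * we cannot be pushed to another CPU */
		migrate_enable();	/* pinning ends, balancing may move us again */
	}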

include/linux/sched.h

Lines changed: 2 additions & 1 deletion
@@ -716,8 +716,9 @@ struct task_struct {
 	cpumask_t			cpus_mask;
 	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
-	int				migration_disabled;
+	unsigned short			migration_disabled;
 #endif
+	unsigned short			migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
 	int				rcu_read_lock_nesting;

kernel/sched/core.c

Lines changed: 57 additions & 10 deletions
@@ -1763,11 +1763,6 @@ void migrate_enable(void)
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return p->migration_disabled;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return rq->nr_pinned;
@@ -1972,6 +1967,49 @@ static int migration_cpu_stop(void *data)
 	return 0;
 }
 
+int push_cpu_stop(void *arg)
+{
+	struct rq *lowest_rq = NULL, *rq = this_rq();
+	struct task_struct *p = arg;
+
+	raw_spin_lock_irq(&p->pi_lock);
+	raw_spin_lock(&rq->lock);
+
+	if (task_rq(p) != rq)
+		goto out_unlock;
+
+	if (is_migration_disabled(p)) {
+		p->migration_flags |= MDF_PUSH;
+		goto out_unlock;
+	}
+
+	p->migration_flags &= ~MDF_PUSH;
+
+	if (p->sched_class->find_lock_rq)
+		lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+	if (!lowest_rq)
+		goto out_unlock;
+
+	// XXX validate p is still the highest prio task
+	if (task_rq(p) == rq) {
+		deactivate_task(rq, p, 0);
+		set_task_cpu(p, lowest_rq->cpu);
+		activate_task(lowest_rq, p, 0);
+		resched_curr(lowest_rq);
+	}
+
+	double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+	rq->push_busy = false;
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+	return 0;
+}
+
 /*
  * sched_class::set_cpus_allowed must do the below, but is not required to
  * actually call this function.
@@ -2052,6 +2090,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
+		struct task_struct *push_task = NULL;
+
+		if ((flags & SCA_MIGRATE_ENABLE) &&
+		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
+			rq->push_busy = true;
+			push_task = get_task_struct(p);
+		}
+
 		pending = p->migration_pending;
 		if (pending) {
 			refcount_inc(&pending->refs);
@@ -2060,6 +2106,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 		}
 		task_rq_unlock(rq, p, rf);
 
+		if (push_task) {
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    p, &rq->push_work);
+		}
+
 		if (complete)
 			goto do_complete;
 
@@ -2098,6 +2149,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
 	if (flags & SCA_MIGRATE_ENABLE) {
 
 		refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
+		p->migration_flags &= ~MDF_PUSH;
 		task_rq_unlock(rq, p, rf);
 
 		pending->arg = (struct migration_arg) {
@@ -2716,11 +2768,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 
 static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
 
-static inline bool is_migration_disabled(struct task_struct *p)
-{
-	return false;
-}
-
 static inline bool rq_has_pinned_tasks(struct rq *rq)
 {
 	return false;

kernel/sched/deadline.c

Lines changed: 22 additions & 7 deletions
@@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task))
+		return 0;
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq)
 static void pull_dl_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, cpu;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
@@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this_rq)
 			continue;
 
 		/* Might drop this_rq->lock */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this_rq)
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
-			dmin = p->dl.deadline;
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				dmin = p->dl.deadline;
+				resched = true;
+			}
 
 			/* Is there any other task even earlier? */
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class
 	.rq_online		= rq_online_dl,
 	.rq_offline		= rq_offline_dl,
 	.task_woken		= task_woken_dl,
+	.find_lock_rq		= find_lock_later_rq,
 #endif
 
 	.task_tick		= task_tick_dl,

kernel/sched/rt.c

Lines changed: 51 additions & 12 deletions
@@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
  * running task can migrate over to a CPU that is running a task
  * of lesser priority.
  */
-static int push_rt_task(struct rq *rq)
+static int push_rt_task(struct rq *rq, bool pull)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
@@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq)
 		return 0;
 
 retry:
+	if (is_migration_disabled(next_task)) {
+		struct task_struct *push_task = NULL;
+		int cpu;
+
+		if (!pull || rq->push_busy)
+			return 0;
+
+		cpu = find_lowest_rq(rq->curr);
+		if (cpu == -1 || cpu == rq->cpu)
+			return 0;
+
+		/*
+		 * Given we found a CPU with lower priority than @next_task,
+		 * therefore it should be running. However we cannot migrate it
+		 * to this other CPU, instead attempt to push the current
+		 * running task on this CPU away.
+		 */
+		push_task = get_push_task(rq);
+		if (push_task) {
+			raw_spin_unlock(&rq->lock);
+			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
+					    push_task, &rq->push_work);
+			raw_spin_lock(&rq->lock);
+		}
+
+		return 0;
+	}
+
 	if (WARN_ON(next_task == rq->curr))
 		return 0;
 
@@ -1927,12 +1955,10 @@ static int push_rt_task(struct rq *rq)
 	deactivate_task(rq, next_task, 0);
 	set_task_cpu(next_task, lowest_rq->cpu);
 	activate_task(lowest_rq, next_task, 0);
-	ret = 1;
-
 	resched_curr(lowest_rq);
+	ret = 1;
 
 	double_unlock_balance(rq, lowest_rq);
-
 out:
 	put_task_struct(next_task);
 
@@ -1942,7 +1968,7 @@ static int push_rt_task(struct rq *rq)
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
-	while (push_rt_task(rq))
+	while (push_rt_task(rq, false))
 		;
 }
 
@@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_work *work)
 	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_tasks(rq);
+		while (push_rt_task(rq, true))
+			;
 		raw_spin_unlock(&rq->lock);
 	}
 
@@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, cpu;
 	bool resched = false;
-	struct task_struct *p;
+	struct task_struct *p, *push_task;
 	struct rq *src_rq;
 	int rt_overload_count = rt_overloaded(this_rq);
 
@@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this_rq)
 		 * double_lock_balance, and another CPU could
 		 * alter this_rq
 		 */
+		push_task = NULL;
 		double_lock_balance(this_rq, src_rq);
 
 		/*
@@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this_rq)
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			resched = true;
-
-			deactivate_task(src_rq, p, 0);
-			set_task_cpu(p, this_cpu);
-			activate_task(this_rq, p, 0);
+			if (is_migration_disabled(p)) {
+				push_task = get_push_task(src_rq);
+			} else {
+				deactivate_task(src_rq, p, 0);
+				set_task_cpu(p, this_cpu);
+				activate_task(this_rq, p, 0);
+				resched = true;
+			}
 			/*
 			 * We continue with the search, just in
 			 * case there's an even higher prio task
@@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this_rq)
 		}
 skip:
 		double_unlock_balance(this_rq, src_rq);
+
+		if (push_task) {
+			raw_spin_unlock(&this_rq->lock);
+			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+					    push_task, &src_rq->push_work);
+			raw_spin_lock(&this_rq->lock);
+		}
 	}
 
 	if (resched)
@@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class
 	.rq_offline		= rq_offline_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
+	.find_lock_rq		= find_lock_lowest_rq,
 #endif
 
 	.task_tick		= task_tick_rt,
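The sixth changed file, kernel/sched/sched.h, is not rendered on this page. Judging from the callers above it presumably carries the MDF_PUSH flag, the rq->push_busy / rq->push_work members, the sched_class::find_lock_rq() hook, and the is_migration_disabled() / get_push_task() helpers. A rough sketch of the assumed shape (an inference from usage, not the verbatim patch):

	/* kernel/sched/sched.h -- assumed declarations, inferred from the hunks above */
	#define MDF_PUSH	0x01	/* push this task out once migrate_enable() runs */

	struct rq {
		/* ... */
		unsigned int		push_busy;	/* a push_cpu_stop() is already queued */
		struct cpu_stop_work	push_work;
		/* ... */
	};

	struct sched_class {
		/* ... */
		struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
		/* ... */
	};

	static inline bool is_migration_disabled(struct task_struct *p)
	{
	#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
		return p->migration_disabled;
	#else
		return false;
	#endif
	}

	/* Grab a reference on rq->curr for pushing, or NULL if pushing is pointless. */
	static inline struct task_struct *get_push_task(struct rq *rq)
	{
		struct task_struct *p = rq->curr;

		lockdep_assert_held(&rq->lock);

		if (rq->push_busy)
			return NULL;

		if (p->nr_cpus_allowed == 1)
			return NULL;

		rq->push_busy = true;
		return get_task_struct(p);
	}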
