
Commit 6ae7143

Merge tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Borislav Petkov:
 "Mostly minor things this time; some highlights:

   - core-sched: Add 'Forced Idle' accounting; this allows to track how
     much CPU time is 'lost' due to core scheduling constraints.

   - psi: Fix for MEM_FULL; a task running reclaim would be counted as a
     runnable task and prevent MEM_FULL from being reported.

   - cpuacct: Long standing fixes for some cgroup accounting issues.

   - rt: Bandwidth timer could, under unusual circumstances, fail to be
     armed, leading to indefinite throttling."

[ Description above by Peter Zijlstra ]

* tag 'sched_core_for_v5.17_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs()
  sched/fair: Cleanup task_util and capacity type
  sched/rt: Try to restart rt period timer when rt runtime exceeded
  sched/fair: Document the slow path and fast path in select_task_rq_fair
  sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity
  sched/fair: Fix detection of per-CPU kthreads waking a task
  sched/cpuacct: Make user/system times in cpuacct.stat more precise
  sched/cpuacct: Fix user/system in shown cpuacct.usage*
  cpuacct: Convert BUG_ON() to WARN_ON_ONCE()
  cputime, cpuacct: Include guest time in user time in cpuacct.stat
  psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim
  sched/core: Forced idle accounting
  psi: Add a missing SPDX license header
  psi: Remove repeated verbose comment
2 parents 01367e8 + 82762d2 commit 6ae7143
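The "Forced Idle" accounting added here charges each running cookied task a share of the time its SMT siblings spent forced idle, scaled by the number of forced-idle siblings and the number of running cookied tasks; see __sched_core_account_forceidle() in the kernel/sched/core_sched.c hunk below. A minimal standalone sketch of that scaling, with made-up inputs, just to make the arithmetic concrete:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: mirrors the scaling in __sched_core_account_forceidle(),
 * where the elapsed delta is multiplied by the forced-idle CPU count and
 * divided by the number of running cookied tasks (the "occupation").
 * The inputs below are hypothetical.
 */
static uint64_t forceidle_charge(uint64_t delta_ns, unsigned int forceidle_count,
                                 unsigned int occupation)
{
        if (!occupation)
                return 0;       /* can't be forced idle without a running task */
        return delta_ns * forceidle_count / occupation;
}

int main(void)
{
        /* 6 ms window, 2 forced-idle siblings, 3 running cookied tasks */
        printf("%llu ns charged per cookied task\n",
               (unsigned long long)forceidle_charge(6000000ULL, 2, 3));
        return 0;
}

With a 6 ms forced-idle window, two forced-idle siblings and three running cookied tasks, each cookied task is charged 4 ms.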

14 files changed: +343 −181 lines changed


include/linux/psi.h

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_PSI_H
 #define _LINUX_PSI_H
 

include/linux/psi_types.h

Lines changed: 13 additions & 1 deletion
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_PSI_TYPES_H
 #define _LINUX_PSI_TYPES_H
 
@@ -21,14 +22,25 @@ enum psi_task_count {
         * don't have to special case any state tracking for it.
         */
        NR_ONCPU,
-       NR_PSI_TASK_COUNTS = 4,
+       /*
+        * For IO and CPU stalls the presence of running/oncpu tasks
+        * in the domain means a partial rather than a full stall.
+        * For memory it's not so simple because of page reclaimers:
+        * they are running/oncpu while representing a stall. To tell
+        * whether a domain has productivity left or not, we need to
+        * distinguish between regular running (i.e. productive)
+        * threads and memstall ones.
+        */
+       NR_MEMSTALL_RUNNING,
+       NR_PSI_TASK_COUNTS = 5,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT      (1 << NR_IOWAIT)
 #define TSK_MEMSTALL    (1 << NR_MEMSTALL)
 #define TSK_RUNNING     (1 << NR_RUNNING)
 #define TSK_ONCPU       (1 << NR_ONCPU)
+#define TSK_MEMSTALL_RUNNING    (1 << NR_MEMSTALL_RUNNING)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
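The comment above motivates the new counter: reclaimers are running yet still represent a memory stall. The state test that consumes NR_MEMSTALL_RUNNING lives in kernel/sched/psi.c, which is part of this merge but not shown in this excerpt; the sketch below is only an approximation of the shape that check takes, reusing the enum values from the hunk above:

/*
 * Sketch, not the actual kernel/sched/psi.c change: memory counts as a
 * full stall when someone is in memstall and every runnable task in the
 * group is itself a memstall (reclaiming) task, i.e. no productive
 * runnable threads remain.
 */
static bool group_mem_full(const unsigned int tasks[NR_PSI_TASK_COUNTS])
{
        return tasks[NR_MEMSTALL] &&
               tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING];
}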

include/linux/sched.h

Lines changed: 4 additions & 0 deletions
@@ -523,7 +523,11 @@ struct sched_statistics {
        u64                             nr_wakeups_affine_attempts;
        u64                             nr_wakeups_passive;
        u64                             nr_wakeups_idle;
+
+#ifdef CONFIG_SCHED_CORE
+       u64                             core_forceidle_sum;
 #endif
+#endif /* CONFIG_SCHEDSTATS */
 } ____cacheline_aligned;
 
 struct sched_entity {

kernel/sched/core.c

Lines changed: 63 additions & 21 deletions
@@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct *
                return false;
 
        /* flip prio, so high prio is leftmost */
-       if (prio_less(b, a, task_rq(a)->core->core_forceidle))
+       if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
                return true;
 
        return false;
@@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
        rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
 }
 
-void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
 {
        rq->core->core_task_seq++;
 
-       if (!sched_core_enqueued(p))
-               return;
+       if (sched_core_enqueued(p)) {
+               rb_erase(&p->core_node, &rq->core_tree);
+               RB_CLEAR_NODE(&p->core_node);
+       }
 
-       rb_erase(&p->core_node, &rq->core_tree);
-       RB_CLEAR_NODE(&p->core_node);
+       /*
+        * Migrating the last task off the cpu, with the cpu in forced idle
+        * state. Reschedule to create an accounting edge for forced idle,
+        * and re-examine whether the core is still in forced idle state.
+        */
+       if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
+           rq->core->core_forceidle_count && rq->curr == rq->idle)
+               resched_curr(rq);
 }
 
 /*
@@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled)
                for_each_cpu(t, smt_mask)
                        cpu_rq(t)->core_enabled = enabled;
 
+               cpu_rq(cpu)->core->core_forceidle_start = 0;
+
                sched_core_unlock(cpu, &flags);
 
                cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
@@ -364,7 +374,8 @@ void sched_core_put(void)
 #else /* !CONFIG_SCHED_CORE */
 
 static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
+static inline void
+sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
 
 #endif /* CONFIG_SCHED_CORE */
 
@@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        if (sched_core_enabled(rq))
-               sched_core_dequeue(rq, p);
+               sched_core_dequeue(rq, p, flags);
 
        if (!(flags & DEQUEUE_NOCLOCK))
                update_rq_clock(rq);
@@ -5244,6 +5255,7 @@ void scheduler_tick(void)
        if (sched_feat(LATENCY_WARN))
                resched_latency = cpu_resched_latency(rq);
        calc_global_load_tick(rq);
+       sched_core_tick(rq);
 
        rq_unlock(rq, &rf);
 
@@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        struct task_struct *next, *p, *max = NULL;
        const struct cpumask *smt_mask;
        bool fi_before = false;
+       bool core_clock_updated = (rq == rq->core);
        unsigned long cookie;
        int i, cpu, occ = 0;
        struct rq *rq_i;
@@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
        /* reset state */
        rq->core->core_cookie = 0UL;
-       if (rq->core->core_forceidle) {
+       if (rq->core->core_forceidle_count) {
+               if (!core_clock_updated) {
+                       update_rq_clock(rq->core);
+                       core_clock_updated = true;
+               }
+               sched_core_account_forceidle(rq);
+               /* reset after accounting force idle */
+               rq->core->core_forceidle_start = 0;
+               rq->core->core_forceidle_count = 0;
+               rq->core->core_forceidle_occupation = 0;
                need_sync = true;
                fi_before = true;
-               rq->core->core_forceidle = false;
        }
 
        /*
@@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
        for_each_cpu_wrap(i, smt_mask, cpu) {
                rq_i = cpu_rq(i);
 
-               if (i != cpu)
+               /*
+                * Current cpu always has its clock updated on entrance to
+                * pick_next_task(). If the current cpu is not the core,
+                * the core may also have been updated above.
+                */
+               if (i != cpu && (rq_i != rq->core || !core_clock_updated))
                        update_rq_clock(rq_i);
 
                p = rq_i->core_pick = pick_task(rq_i);
@@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
                if (p == rq_i->idle) {
                        if (rq_i->nr_running) {
-                               rq->core->core_forceidle = true;
+                               rq->core->core_forceidle_count++;
                                if (!fi_before)
                                        rq->core->core_forceidle_seq++;
                        }
@@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                }
        }
 
+       if (schedstat_enabled() && rq->core->core_forceidle_count) {
+               if (cookie)
+                       rq->core->core_forceidle_start = rq_clock(rq->core);
+               rq->core->core_forceidle_occupation = occ;
+       }
+
        rq->core->core_pick_seq = rq->core->core_task_seq;
        next = rq->core_pick;
        rq->core_sched_seq = rq->core->core_pick_seq;
@@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                 *      1       0       1
                 *      1       1       0
                 */
-               if (!(fi_before && rq->core->core_forceidle))
-                       task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle);
+               if (!(fi_before && rq->core->core_forceidle_count))
+                       task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count);
 
                rq_i->core_pick->core_occupation = occ;
 
@@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu)
                goto unlock;
 
        /* copy the shared state to the new leader */
-       core_rq->core_task_seq = rq->core_task_seq;
-       core_rq->core_pick_seq = rq->core_pick_seq;
-       core_rq->core_cookie = rq->core_cookie;
-       core_rq->core_forceidle = rq->core_forceidle;
-       core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+       core_rq->core_task_seq = rq->core_task_seq;
+       core_rq->core_pick_seq = rq->core_pick_seq;
+       core_rq->core_cookie = rq->core_cookie;
+       core_rq->core_forceidle_count = rq->core_forceidle_count;
+       core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+       core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
+
+       /*
+        * Accounting edge for forced idle is handled in pick_next_task().
+        * Don't need another one here, since the hotplug thread shouldn't
+        * have a cookie.
+        */
+       core_rq->core_forceidle_start = 0;
 
        /* install new leader */
        for_each_cpu(t, smt_mask) {
@@ -7126,7 +7166,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
 
 unsigned long sched_cpu_util(int cpu, unsigned long max)
 {
-       return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max,
+       return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
                                  ENERGY_UTIL, NULL);
 }
 #endif /* CONFIG_SMP */
@@ -9409,7 +9449,9 @@ void __init sched_init(void)
                rq->core_pick = NULL;
                rq->core_enabled = 0;
                rq->core_tree = RB_ROOT;
-               rq->core_forceidle = false;
+               rq->core_forceidle_count = 0;
+               rq->core_forceidle_occupation = 0;
+               rq->core_forceidle_start = 0;
 
                rq->core_cookie = 0UL;
 #endif
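scheduler_tick() and pick_next_task() above call sched_core_tick() and sched_core_account_forceidle(), whose definitions are not part of this excerpt; they presumably live in kernel/sched/sched.h (one of the 14 files this merge touches) as thin wrappers over the __sched_core_tick()/__sched_core_account_forceidle() helpers added in core_sched.c below. A hedged sketch of what such wrappers could look like; the exact guards in the real header may differ:

/*
 * Sketch only -- a plausible wrapper shape, not the verbatim
 * kernel/sched/sched.h change. The point is that both entry points
 * become no-ops unless CONFIG_SCHEDSTATS and runtime schedstats are on.
 */
#ifdef CONFIG_SCHEDSTATS

extern void __sched_core_account_forceidle(struct rq *rq);

static inline void sched_core_account_forceidle(struct rq *rq)
{
        if (schedstat_enabled())
                __sched_core_account_forceidle(rq);
}

extern void __sched_core_tick(struct rq *rq);

static inline void sched_core_tick(struct rq *rq)
{
        if (sched_core_enabled(rq) && schedstat_enabled())
                __sched_core_tick(rq);
}

#else /* !CONFIG_SCHEDSTATS */

static inline void sched_core_account_forceidle(struct rq *rq) { }
static inline void sched_core_tick(struct rq *rq) { }

#endif /* CONFIG_SCHEDSTATS */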

kernel/sched/core_sched.c

Lines changed: 65 additions & 1 deletion
@@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
 
        enqueued = sched_core_enqueued(p);
        if (enqueued)
-               sched_core_dequeue(rq, p);
+               sched_core_dequeue(rq, p, DEQUEUE_SAVE);
 
        old_cookie = p->core_cookie;
        p->core_cookie = cookie;
@@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
         * If task is currently running, it may not be compatible anymore after
         * the cookie change, so enter the scheduler on its CPU to schedule it
         * away.
+        *
+        * Note that it is possible that as a result of this cookie change, the
+        * core has now entered/left forced idle state. Defer accounting to the
+        * next scheduling edge, rather than always forcing a reschedule here.
         */
        if (task_running(rq, p))
                resched_curr(rq);
@@ -232,3 +236,63 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
        return err;
 }
 
+#ifdef CONFIG_SCHEDSTATS
+
+/* REQUIRES: rq->core's clock recently updated. */
+void __sched_core_account_forceidle(struct rq *rq)
+{
+       const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
+       u64 delta, now = rq_clock(rq->core);
+       struct rq *rq_i;
+       struct task_struct *p;
+       int i;
+
+       lockdep_assert_rq_held(rq);
+
+       WARN_ON_ONCE(!rq->core->core_forceidle_count);
+
+       if (rq->core->core_forceidle_start == 0)
+               return;
+
+       delta = now - rq->core->core_forceidle_start;
+       if (unlikely((s64)delta <= 0))
+               return;
+
+       rq->core->core_forceidle_start = now;
+
+       if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) {
+               /* can't be forced idle without a running task */
+       } else if (rq->core->core_forceidle_count > 1 ||
+                  rq->core->core_forceidle_occupation > 1) {
+               /*
+                * For larger SMT configurations, we need to scale the charged
+                * forced idle amount since there can be more than one forced
+                * idle sibling and more than one running cookied task.
+                */
+               delta *= rq->core->core_forceidle_count;
+               delta = div_u64(delta, rq->core->core_forceidle_occupation);
+       }
+
+       for_each_cpu(i, smt_mask) {
+               rq_i = cpu_rq(i);
+               p = rq_i->core_pick ?: rq_i->curr;
+
+               if (!p->core_cookie)
+                       continue;
+
+               __schedstat_add(p->stats.core_forceidle_sum, delta);
+       }
+}
+
+void __sched_core_tick(struct rq *rq)
+{
+       if (!rq->core->core_forceidle_count)
+               return;
+
+       if (rq != rq->core)
+               update_rq_clock(rq->core);
+
+       __sched_core_account_forceidle(rq);
+}
+
+#endif /* CONFIG_SCHEDSTATS */
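The accumulated value lands in p->stats.core_forceidle_sum. How it is exported is not shown in this excerpt; assuming it is surfaced through /proc/<pid>/sched like the other sched_statistics fields, and that schedstats is enabled (kernel.sched_schedstats=1), a small userspace sketch to observe it could look like this:

/*
 * Userspace sketch. Assumes core_forceidle_sum is printed in
 * /proc/<pid>/sched alongside the other schedstat fields; the exporting
 * code is part of this merge but not included in this excerpt.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/sched", "r");

        if (!f)
                return 1;

        while (fgets(line, sizeof(line), f))
                if (strstr(line, "core_forceidle"))
                        fputs(line, stdout);

        fclose(f);
        return 0;
}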
