
Commit 3d4b47b

Peter Zijlstra authored and Ingo Molnar committed
sched: Implement on-demand (active) cfs_rq list
Make certain load-balance actions scale per number of active cgroups instead of the number of existing cgroups.

This makes wakeup/sleep paths more expensive, but is a win for systems where the vast majority of existing cgroups are idle.

Signed-off-by: Paul Turner <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
1 parent 2069dd7 commit 3d4b47b
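
The shape of the change is easy to see outside the kernel. Below is a minimal, self-contained userspace sketch of the pattern this patch introduces — not code from the commit: group_rq, leaf_list, enqueue/dequeue/lb_update and IDLE_THRESHOLD are made-up stand-ins for cfs_rq, rq->leaf_cfs_rq_list, the enqueue/dequeue paths, update_cfs_load(cfs_rq, 1) and the (period / 8) cutoff in the diff below. A group joins the per-cpu leaf list on its first enqueue and is unlinked lazily from the load-balance path once it is idle and its load contribution has decayed, so balancing walks only active groups.

#include <stdio.h>

#define IDLE_THRESHOLD 16               /* stand-in for the (period / 8) cutoff */

struct list_head { struct list_head *next, *prev; };

static void list_add(struct list_head *new, struct list_head *head)
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

static void list_del(struct list_head *entry)
{
        entry->prev->next = entry->next;
        entry->next->prev = entry->prev;
}

static struct list_head leaf_list = { &leaf_list, &leaf_list }; /* stand-in for rq->leaf_cfs_rq_list */

struct group_rq {                       /* stand-in for struct cfs_rq */
        int nr_running;
        int on_list;
        unsigned long load_avg;
        struct list_head leaf;
};

static void enqueue(struct group_rq *grq)
{
        grq->nr_running++;
        grq->load_avg += 100;
        if (grq->nr_running == 1 && !grq->on_list) {
                list_add(&grq->leaf, &leaf_list); /* first runnable task: join the leaf list */
                grq->on_list = 1;
        }
}

static void dequeue(struct group_rq *grq)
{
        grq->nr_running--;              /* group stays listed; removal is deferred */
}

static void lb_update(struct group_rq *grq) /* stand-in for update_cfs_load(cfs_rq, 1) */
{
        grq->load_avg /= 2;             /* crude decay of the group's load contribution */
        if (!grq->nr_running && grq->load_avg < IDLE_THRESHOLD && grq->on_list) {
                list_del(&grq->leaf);   /* idle and decayed: drop off the leaf list */
                grq->on_list = 0;
        }
}

int main(void)
{
        struct group_rq a = { .leaf = { &a.leaf, &a.leaf } };

        enqueue(&a);
        printf("after enqueue:    on_list=%d\n", a.on_list); /* 1: visited by load balance */
        dequeue(&a);
        lb_update(&a);                  /* 100 -> 50, still listed */
        lb_update(&a);                  /* 50 -> 25, still listed */
        lb_update(&a);                  /* 25 -> 12 < 16, unlinked */
        printf("after idle decay: on_list=%d\n", a.on_list); /* 0: skipped until re-enqueued */
        return 0;
}

Note that removal is deferred to the balance path rather than done on dequeue, which keeps the wakeup/sleep hot paths down to a single flag check — the trade-off the commit message describes.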

3 files changed, +92 −83 lines changed


kernel/sched.c

Lines changed: 28 additions & 77 deletions
@@ -274,9 +274,7 @@ struct task_group {

 #define root_task_group init_task_group

-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);

 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -344,6 +342,7 @@ struct cfs_rq {
          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
          * list is used during load balance.
          */
+        int on_list;
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg; /* group that "owns" this runqueue */

@@ -1547,7 +1546,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)

 #ifdef CONFIG_FAIR_GROUP_SCHED

-static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_load(struct cfs_rq *cfs_rq, int lb);
 static void update_cfs_shares(struct cfs_rq *cfs_rq);

 /*
@@ -1570,7 +1569,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
         raw_spin_lock_irqsave(&rq->lock, flags);

         update_rq_clock(rq);
-        update_cfs_load(cfs_rq);
+        update_cfs_load(cfs_rq, 1);

         load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
         load_avg -= cfs_rq->load_contribution;
@@ -7688,15 +7687,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)

 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-                struct sched_entity *se, int cpu, int add,
+                struct sched_entity *se, int cpu,
                 struct sched_entity *parent)
 {
         struct rq *rq = cpu_rq(cpu);
         tg->cfs_rq[cpu] = cfs_rq;
         init_cfs_rq(cfs_rq, rq);
         cfs_rq->tg = tg;
-        if (add)
-                list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);

         tg->se[cpu] = se;
         /* se could be NULL for init_task_group */
@@ -7716,7 +7713,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

 #ifdef CONFIG_RT_GROUP_SCHED
 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-                struct sched_rt_entity *rt_se, int cpu, int add,
+                struct sched_rt_entity *rt_se, int cpu,
                 struct sched_rt_entity *parent)
 {
         struct rq *rq = cpu_rq(cpu);
@@ -7725,8 +7722,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-        if (add)
-                list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);

         tg->rt_se[cpu] = rt_se;
         if (!rt_se)
@@ -7835,15 +7830,15 @@ void __init sched_init(void)
                  * We achieve this by letting init_task_group's tasks sit
                  * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                  */
-                init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
+                init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
 #endif
 #endif /* CONFIG_FAIR_GROUP_SCHED */

                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
 #ifdef CONFIG_CGROUP_SCHED
-                init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
+                init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
 #endif
 #endif

@@ -8119,7 +8114,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!se)
                         goto err_free_rq;

-                init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+                init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
         }

         return 1;
@@ -8130,15 +8125,22 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 0;
 }

-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-        list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-                        &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
-        list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+        struct rq *rq = cpu_rq(cpu);
+        unsigned long flags;
+        int i;
+
+        /*
+         * Only empty task groups can be destroyed; so we can speculatively
+         * check on_list without danger of it being re-added.
+         */
+        if (!tg->cfs_rq[cpu]->on_list)
+                return;
+
+        raw_spin_lock_irqsave(&rq->lock, flags);
+        list_del_leaf_cfs_rq(tg->cfs_rq[i]);
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else /* !CONFG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
@@ -8151,10 +8153,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
 }

-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
@@ -8209,7 +8207,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                 if (!rt_se)
                         goto err_free_rq;

-                init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
         }

         return 1;
@@ -8219,17 +8217,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 err:
         return 0;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-        list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-                        &cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-        list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
 #else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
@@ -8240,14 +8227,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
         return 1;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
 #endif /* CONFIG_RT_GROUP_SCHED */

 #ifdef CONFIG_CGROUP_SCHED
@@ -8263,7 +8242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 {
         struct task_group *tg;
         unsigned long flags;
-        int i;

         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
         if (!tg)
@@ -8276,10 +8254,6 @@ struct task_group *sched_create_group(struct task_group *parent)
                 goto err;

         spin_lock_irqsave(&task_group_lock, flags);
-        for_each_possible_cpu(i) {
-                register_fair_sched_group(tg, i);
-                register_rt_sched_group(tg, i);
-        }
         list_add_rcu(&tg->list, &task_groups);

         WARN_ON(!parent); /* root should already exist */
@@ -8309,11 +8283,11 @@ void sched_destroy_group(struct task_group *tg)
         unsigned long flags;
         int i;

-        spin_lock_irqsave(&task_group_lock, flags);
-        for_each_possible_cpu(i) {
+        /* end participation in shares distribution */
+        for_each_possible_cpu(i)
                 unregister_fair_sched_group(tg, i);
-                unregister_rt_sched_group(tg, i);
-        }
+
+        spin_lock_irqsave(&task_group_lock, flags);
         list_del_rcu(&tg->list);
         list_del_rcu(&tg->siblings);
         spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8391,7 +8365,6 @@ static DEFINE_MUTEX(shares_mutex);
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
         int i;
-        unsigned long flags;

         /*
          * We can't change the weight of the root cgroup.
@@ -8408,19 +8381,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         if (tg->shares == shares)
                 goto done;

-        spin_lock_irqsave(&task_group_lock, flags);
-        for_each_possible_cpu(i)
-                unregister_fair_sched_group(tg, i);
-        list_del_rcu(&tg->siblings);
-        spin_unlock_irqrestore(&task_group_lock, flags);
-
-        /* wait for any ongoing reference to this group to finish */
-        synchronize_sched();
-
-        /*
-         * Now we are free to modify the group's share on each cpu
-         * w/o tripping rebalance_share or load_balance_fair.
-         */
         tg->shares = shares;
         for_each_possible_cpu(i) {
                 /*
@@ -8429,15 +8389,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                 set_se_shares(tg->se[i], shares);
         }

-        /*
-         * Enable load balance activity on this group, by inserting it back on
-         * each cpu's rq->leaf_cfs_rq_list.
-         */
-        spin_lock_irqsave(&task_group_lock, flags);
-        for_each_possible_cpu(i)
-                register_fair_sched_group(tg, i);
-        list_add_rcu(&tg->siblings, &tg->parent->children);
-        spin_unlock_irqrestore(&task_group_lock, flags);
 done:
         mutex_unlock(&shares_mutex);
         return 0;

kernel/sched_fair.c

Lines changed: 40 additions & 6 deletions
@@ -143,6 +143,24 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return cfs_rq->tg->cfs_rq[this_cpu];
 }

+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+        if (!cfs_rq->on_list) {
+                list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+                                &rq_of(cfs_rq)->leaf_cfs_rq_list);
+
+                cfs_rq->on_list = 1;
+        }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+        if (cfs_rq->on_list) {
+                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+                cfs_rq->on_list = 0;
+        }
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +264,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
         return &cpu_rq(this_cpu)->cfs;
 }

+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
         for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

@@ -648,7 +674,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }

 #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void update_cfs_load(struct cfs_rq *cfs_rq)
+static void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
 {
         u64 period = sched_avg_period();
         u64 now, delta;
@@ -673,6 +699,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq)
                 cfs_rq->load_period /= 2;
                 cfs_rq->load_avg /= 2;
         }
+
+        if (lb && !cfs_rq->nr_running) {
+                if (cfs_rq->load_avg < (period / 8))
+                        list_del_leaf_cfs_rq(cfs_rq);
+        }
 }

 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
@@ -719,7 +750,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
                 reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_load(struct cfs_rq *cfs_rq)
+static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb)
 {
 }

@@ -849,7 +880,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
-        update_cfs_load(cfs_rq);
+        update_cfs_load(cfs_rq, 0);
         account_entity_enqueue(cfs_rq, se);
         update_cfs_shares(cfs_rq);

@@ -863,6 +894,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         if (se != cfs_rq->curr)
                 __enqueue_entity(cfs_rq, se);
         se->on_rq = 1;
+
+        if (cfs_rq->nr_running == 1)
+                list_add_leaf_cfs_rq(cfs_rq);
 }

 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -907,7 +941,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         if (se != cfs_rq->curr)
                 __dequeue_entity(cfs_rq, se);
         se->on_rq = 0;
-        update_cfs_load(cfs_rq);
+        update_cfs_load(cfs_rq, 0);
         account_entity_dequeue(cfs_rq, se);
         update_min_vruntime(cfs_rq);
         update_cfs_shares(cfs_rq);
@@ -1142,7 +1176,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);

-                update_cfs_load(cfs_rq);
+                update_cfs_load(cfs_rq, 0);
                 update_cfs_shares(cfs_rq);
         }

@@ -1172,7 +1206,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         for_each_sched_entity(se) {
                 struct cfs_rq *cfs_rq = cfs_rq_of(se);

-                update_cfs_load(cfs_rq);
+                update_cfs_load(cfs_rq, 0);
                 update_cfs_shares(cfs_rq);
         }
