
Commit 2069dd7

Peter Zijlstra authored and Ingo Molnar committed
sched: Rewrite tg_shares_up()
By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.

This should improve cpu-cgroup performance for smp systems
significantly.

[ Paul: changed to use queueing cfs_rq + bug fixes ]

Signed-off-by: Paul Turner <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Parent: 48c5cca · Commit: 2069dd7
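The rewrite described in the commit message keeps a per-cpu load average and, on each tick, folds only the delta since the last fold into a group-wide total. The following user-space sketch models that accounting idea only; the names (struct group_stats, struct cpu_stats, fold_cpu_load) are invented for this illustration and do not exist in the kernel.

/*
 * Illustrative sketch only: a user-space model of per-cpu load folding.
 * Each cpu keeps a running load_avg over load_period and remembers how
 * much it already contributed to the group total, so a fold adds only
 * the delta.
 */
#include <stdatomic.h>
#include <stdio.h>

struct group_stats {
	atomic_long load_weight;	/* group-wide total, folded from all cpus */
};

struct cpu_stats {
	unsigned long long load_avg;	/* accumulated weight integral */
	unsigned long long load_period;	/* time it accumulated over */
	long load_contribution;		/* what this cpu already folded in */
};

/* Called per cpu, e.g. from the tick: fold only the delta. */
static void fold_cpu_load(struct group_stats *tg, struct cpu_stats *cs)
{
	long avg = (long)(cs->load_avg / (cs->load_period + 1));
	long delta = avg - cs->load_contribution;

	atomic_fetch_add(&tg->load_weight, delta);
	cs->load_contribution += delta;
}

int main(void)
{
	struct group_stats tg;
	struct cpu_stats cpu0 = { .load_avg = 2048 * 10, .load_period = 9 };
	struct cpu_stats cpu1 = { .load_avg = 1024 * 4,  .load_period = 3 };

	atomic_init(&tg.load_weight, 0);

	fold_cpu_load(&tg, &cpu0);
	fold_cpu_load(&tg, &cpu1);
	/* a later tick on cpu1 with unchanged load folds a zero delta */
	fold_cpu_load(&tg, &cpu1);

	printf("tg load_weight = %ld\n", atomic_load(&tg.load_weight));
	return 0;
}

Because only the delta is folded, a cpu whose load has not changed contributes nothing on later ticks, which is what lets the per-cpu walk below avoid touching remote runqueues.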

6 files changed: 162 additions, 213 deletions

include/linux/sched.h

Lines changed: 0 additions & 2 deletions
@@ -1885,8 +1885,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
 extern unsigned int sysctl_sched_child_runs_first;
 
 enum sched_tunable_scaling {

kernel/sched.c

Lines changed: 44 additions & 129 deletions
@@ -253,6 +253,8 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+
+	atomic_t load_weight;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
 	 */
 	unsigned long h_load;
 
-	/*
-	 * this cpu's part of tg->shares
-	 */
-	unsigned long shares;
+	u64 load_avg;
+	u64 load_period;
+	u64 load_stamp;
 
-	/*
-	 * load.weight at the time we set shares
-	 */
-	unsigned long rq_weight;
+	unsigned long load_contribution;
 #endif
 #endif
 };
@@ -806,20 +804,6 @@ late_initcall(sched_init_debug);
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
-/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
 /*
  * period over which we average the RT time consumption, measured
  * in ms.
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 	lw->inv_weight = 0;
 }
 
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+	lw->weight = w;
+	lw->inv_weight = 0;
+}
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-				    unsigned long sd_shares,
-				    unsigned long sd_rq_weight,
-				    unsigned long *usd_rq_weight)
-{
-	unsigned long shares, rq_weight;
-	int boost = 0;
-
-	rq_weight = usd_rq_weight[cpu];
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *             \Sum_j shares_j * rq_weight_i
-	 * shares_i =  -----------------------------
-	 *                  \Sum_j rq_weight_j
-	 */
-	shares = (sd_shares * rq_weight) / sd_rq_weight;
-	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-	if (abs(shares - tg->se[cpu]->load.weight) >
-			sysctl_sched_shares_thresh) {
-		struct rq *rq = cpu_rq(cpu);
-		unsigned long flags;
-
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		__set_se_shares(tg->se[cpu], shares);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-}
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
+ * update tg->load_weight by folding this cpu's load_avg
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-	unsigned long *usd_rq_weight;
-	struct sched_domain *sd = data;
+	long load_avg;
+	struct cfs_rq *cfs_rq;
 	unsigned long flags;
-	int i;
+	int cpu = (long)data;
+	struct rq *rq;
 
-	if (!tg->se[0])
+	if (!tg->se[cpu])
 		return 0;
 
-	local_irq_save(flags);
-	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-	for_each_cpu(i, sched_domain_span(sd)) {
-		weight = tg->cfs_rq[i]->load.weight;
-		usd_rq_weight[i] = weight;
-
-		rq_weight += weight;
-		/*
-		 * If there are currently no tasks on the cpu pretend there
-		 * is one of average load so that when a new task gets to
-		 * run here it will not get delayed by group starvation.
-		 */
-		if (!weight)
-			weight = NICE_0_LOAD;
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
 
-		sum_weight += weight;
-		shares += tg->cfs_rq[i]->shares;
-	}
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	if (!rq_weight)
-		rq_weight = sum_weight;
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq);
 
-	if ((!shares && rq_weight) || shares > tg->shares)
-		shares = tg->shares;
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
 
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
+	atomic_add(load_avg, &tg->load_weight);
+	cfs_rq->load_contribution += load_avg;
 
-	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq);
 
-	local_irq_restore(flags);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	return 0;
 }
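The rewritten tg_shares_up() above defers the actual weight adjustment to update_cfs_load() and update_cfs_shares(), which this commit adds in kernel/sched_fair.c (one of the six changed files, not shown in this excerpt). As a rough illustration of the proportional split such a per-cpu shares update performs (purely a sketch of the arithmetic, not the kernel implementation), a cpu's share can be derived from the group's total shares scaled by that cpu's portion of the folded group load, clamped to sane bounds:

/*
 * Illustrative sketch only: models the split
 *
 *	shares_cpu = tg_shares * cpu_load / tg_load
 *
 * clamped to [MIN_SHARES, tg_shares].  MIN_SHARES and calc_cpu_shares
 * are invented for this example; this is not the sched_fair.c code.
 */
#include <stdio.h>

#define MIN_SHARES	2UL

static unsigned long calc_cpu_shares(unsigned long tg_shares,
				     unsigned long cpu_load,
				     unsigned long tg_load)
{
	unsigned long shares = tg_shares;

	if (tg_load)
		shares = tg_shares * cpu_load / tg_load;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;

	return shares;
}

int main(void)
{
	/* a group with 1024 shares whose folded load came from two cpus */
	unsigned long tg_shares = 1024, tg_load = 3072;

	printf("cpu0: %lu\n", calc_cpu_shares(tg_shares, 2048, tg_load)); /* 682 */
	printf("cpu1: %lu\n", calc_cpu_shares(tg_shares, 1024, tg_load)); /* 341 */
	return 0;
}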
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
 		load = cpu_rq(cpu)->load.weight;
 	} else {
 		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->cfs_rq[cpu]->shares;
+		load *= tg->se[cpu]->load.weight;
 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
 	}
 
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
 	return 0;
 }
 
-static void update_shares(struct sched_domain *sd)
+static void update_shares(long cpu)
 {
-	s64 elapsed;
-	u64 now;
-
 	if (root_task_group_empty())
 		return;
 
-	now = local_clock();
-	elapsed = now - sd->last_update;
+	/*
+	 * XXX: replace with an on-demand list
+	 */
 
-	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, sd);
-	}
+	walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
 }
 
 static void update_h_load(long cpu)
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
 
 #else
 
-static inline void update_shares(struct sched_domain *sd)
+static inline void update_shares(int cpu)
 {
 }
 
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-	cfs_rq->shares = shares;
-#endif
-}
-#endif
-
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
 	SET_SYSCTL(sched_min_granularity);
 	SET_SYSCTL(sched_latency);
 	SET_SYSCTL(sched_wakeup_granularity);
-	SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
 
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 		se->cfs_rq = parent->my_q;
 
 	se->my_q = cfs_rq;
-	se->load.weight = tg->shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, tg->shares);
 	se->parent = parent;
 }
 #endif
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-					    __alignof__(unsigned long));
-#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
 
-	se->load.weight = shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, shares);
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/*
 		 * force a rebalance
 		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
 	}
 
kernel/sched_debug.c

Lines changed: 11 additions & 4 deletions
@@ -202,15 +202,22 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	spread0 = min_vruntime - rq0_min_vruntime;
 	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
 			SPLIT_NS(spread0));
-	SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
-	SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-
 	SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
+	SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
+	SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_SMP
-	SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
+	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
+			SPLIT_NS(cfs_rq->load_avg));
+	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
+			SPLIT_NS(cfs_rq->load_period));
+	SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
+			cfs_rq->load_contribution);
+	SEQ_printf(m, " .%-30s: %d\n", "load_tg",
+			atomic_read(&tg->load_weight));
 #endif
+
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
