@@ -253,6 +253,8 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+
+	atomic_t load_weight;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
 	 */
 	unsigned long h_load;
 
-	/*
-	 * this cpu's part of tg->shares
-	 */
-	unsigned long shares;
+	u64 load_avg;
+	u64 load_period;
+	u64 load_stamp;
 
-	/*
-	 * load.weight at the time we set shares
-	 */
-	unsigned long rq_weight;
+	unsigned long load_contribution;
 #endif
 #endif
 };
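The four new cfs_rq fields replace the cached per-cpu share with a time-weighted load history. update_cfs_load() and update_cfs_shares() are only forward-declared in the hunks shown here, so the following is a rough userspace sketch of how fields of this shape could be maintained, not the kernel implementation; only the field names come from the diff, while the helper name and the decay window are assumptions.

#include <stdint.h>

struct fake_cfs_rq {
	uint64_t queue_weight;	/* stands in for cfs_rq->load.weight */
	uint64_t load_avg;	/* accumulated weight * time */
	uint64_t load_period;	/* time span covered by load_avg */
	uint64_t load_stamp;	/* timestamp of the previous update */
};

void fake_update_cfs_load(struct fake_cfs_rq *cfs_rq, uint64_t now_ns)
{
	uint64_t delta = now_ns - cfs_rq->load_stamp;

	cfs_rq->load_stamp = now_ns;
	cfs_rq->load_period += delta;
	cfs_rq->load_avg += delta * cfs_rq->queue_weight;

	/* halve both terms so old history decays out of the average */
	while (cfs_rq->load_period > 4000000ULL) {	/* ~4ms window, assumed */
		cfs_rq->load_period /= 2;
		cfs_rq->load_avg /= 2;
	}
}

With bookkeeping of this shape, load_avg / (load_period + 1) in tg_shares_up() below yields the queue's average weight over the recent window.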
@@ -806,20 +804,6 @@ late_initcall(sched_init_debug);
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
-/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
 /*
  * period over which we average the RT time consumption, measured
  * in ms.
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 	lw->inv_weight = 0;
 }
 
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+	lw->weight = w;
+	lw->inv_weight = 0;
+}
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
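The new update_load_set() helper sits next to update_load_add()/update_load_sub() and follows the same convention: install the weight and zero the cached inverse so it can be recomputed the next time a weighted division needs it. A minimal userspace sketch of that lazy-inverse pattern, assuming an illustrative 2^32 scaling constant and a hypothetical weighted_div() helper:

#include <stdint.h>
#include <stdio.h>

struct load_weight {
	uint64_t weight;
	uint64_t inv_weight;	/* cached 2^32 / weight; 0 means "stale" */
};

static inline void update_load_set(struct load_weight *lw, uint64_t w)
{
	lw->weight = w;
	lw->inv_weight = 0;	/* invalidate the cache, recompute on demand */
}

/* delta / weight done as a multiply by the cached reciprocal */
static uint64_t weighted_div(uint64_t delta, struct load_weight *lw)
{
	if (!lw->inv_weight)
		lw->inv_weight = (1ULL << 32) / (lw->weight ? lw->weight : 1);

	return (delta * lw->inv_weight) >> 32;
}

int main(void)
{
	struct load_weight lw;

	update_load_set(&lw, 2048);
	printf("%llu\n", (unsigned long long)weighted_div(1 << 20, &lw));	/* 512 */
	return 0;
}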
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-				    unsigned long sd_shares,
-				    unsigned long sd_rq_weight,
-				    unsigned long *usd_rq_weight)
-{
-	unsigned long shares, rq_weight;
-	int boost = 0;
-
-	rq_weight = usd_rq_weight[cpu];
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *             \Sum_j shares_j * rq_weight_i
-	 * shares_i =  -----------------------------
-	 *                  \Sum_j rq_weight_j
-	 */
-	shares = (sd_shares * rq_weight) / sd_rq_weight;
-	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-	if (abs(shares - tg->se[cpu]->load.weight) >
-			sysctl_sched_shares_thresh) {
-		struct rq *rq = cpu_rq(cpu);
-		unsigned long flags;
-
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		__set_se_shares(tg->se[cpu], shares);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-}
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
+ * update tg->load_weight by folding this cpu's load_avg
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-	unsigned long *usd_rq_weight;
-	struct sched_domain *sd = data;
+	long load_avg;
+	struct cfs_rq *cfs_rq;
 	unsigned long flags;
-	int i;
+	int cpu = (long)data;
+	struct rq *rq;
 
-	if (!tg->se[0])
+	if (!tg->se[cpu])
 		return 0;
 
-	local_irq_save(flags);
-	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-	for_each_cpu(i, sched_domain_span(sd)) {
-		weight = tg->cfs_rq[i]->load.weight;
-		usd_rq_weight[i] = weight;
-
-		rq_weight += weight;
-		/*
-		 * If there are currently no tasks on the cpu pretend there
-		 * is one of average load so that when a new task gets to
-		 * run here it will not get delayed by group starvation.
-		 */
-		if (!weight)
-			weight = NICE_0_LOAD;
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
 
-		sum_weight += weight;
-		shares += tg->cfs_rq[i]->shares;
-	}
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	if (!rq_weight)
-		rq_weight = sum_weight;
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq);
 
-	if ((!shares && rq_weight) || shares > tg->shares)
-		shares = tg->shares;
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period + 1);
+	load_avg -= cfs_rq->load_contribution;
 
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
+	atomic_add(load_avg, &tg->load_weight);
+	cfs_rq->load_contribution += load_avg;
 
-	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
	 */
+	update_cfs_shares(cfs_rq);
 
-	local_irq_restore(flags);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	return 0;
 }
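The heart of the new tg_shares_up() is delta accounting: each cpu computes its average load over the sampling window and folds only the change since its last fold into the shared atomic, so repeated calls never double-count a cpu. Below is a standalone sketch of that pattern under simplified types; only the field names follow the diff, the demo values are made up.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct fake_cfs_rq {
	uint64_t load_avg;		/* accumulated weight * time */
	uint64_t load_period;		/* time the accumulation covers */
	int64_t load_contribution;	/* what this cpu last added to the group */
};

struct fake_task_group {
	atomic_long load_weight;	/* sum of all cpus' average loads */
};

static void fold_cpu_load(struct fake_task_group *tg, struct fake_cfs_rq *cfs_rq)
{
	/* average load over the window; +1 avoids dividing by zero */
	int64_t load_avg = (int64_t)(cfs_rq->load_avg / (cfs_rq->load_period + 1));

	/* apply only the delta since the previous fold */
	int64_t delta = load_avg - cfs_rq->load_contribution;

	atomic_fetch_add(&tg->load_weight, delta);
	cfs_rq->load_contribution += delta;
}

int main(void)
{
	struct fake_task_group tg = { 0 };
	struct fake_cfs_rq rq0 = { .load_avg = 10240, .load_period = 9 };

	fold_cpu_load(&tg, &rq0);	/* adds 1024 */
	fold_cpu_load(&tg, &rq0);	/* delta is 0, nothing changes */
	printf("tg load_weight = %ld\n", atomic_load(&tg.load_weight));
	return 0;
}

Because only deltas hit tg->load_weight, the fold can run per cpu under that cpu's rq->lock instead of the old domain-wide pass with its ratelimit and fuzz threshold.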
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
 		load = cpu_rq(cpu)->load.weight;
 	} else {
 		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->cfs_rq[cpu]->shares;
+		load *= tg->se[cpu]->load.weight;
 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
 	}
 
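With the per-cpu shares field gone, the hierarchical load walk in tg_load_down() scales the parent's h_load by the group entity's weight relative to the parent runqueue; the only change is that se->load.weight replaces the old cfs_rq shares value in the numerator. A small arithmetic sketch with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long parent_h_load = 3072;	/* parent group's load on this cpu */
	unsigned long se_weight = 1024;		/* tg->se[cpu]->load.weight */
	unsigned long parent_rq_weight = 2048;	/* parent cfs_rq's load.weight */

	/* the +1 mirrors tg_load_down() and guards against a zero denominator */
	unsigned long h_load = parent_h_load * se_weight / (parent_rq_weight + 1);

	printf("h_load = %lu\n", h_load);	/* roughly half the parent's load */
	return 0;
}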
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
 	return 0;
 }
 
-static void update_shares(struct sched_domain *sd)
+static void update_shares(long cpu)
 {
-	s64 elapsed;
-	u64 now;
-
 	if (root_task_group_empty())
 		return;
 
-	now = local_clock();
-	elapsed = now - sd->last_update;
+	/*
+	 * XXX: replace with an on-demand list
+	 */
 
-	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, sd);
-	}
+	walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
 }
 
 static void update_h_load(long cpu)
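update_shares() now walks the task-group tree for a single cpu instead of a sched_domain, and the ratelimit bookkeeping disappears; the cpu number rides through walk_tg_tree()'s void * data argument as a cast long, which tg_shares_up() recovers with (long)data. A tiny sketch of that round trip with a stand-in walker (all names here are hypothetical):

#include <stdio.h>

/* stand-in for walk_tg_tree(): just invokes the callback with the opaque data */
static int fake_walk(int (*fn)(void *data), void *data)
{
	return fn(data);
}

static int fake_tg_shares_up(void *data)
{
	int cpu = (long)data;	/* recover the cpu number */

	printf("folding load for cpu %d\n", cpu);
	return 0;
}

static void fake_update_shares(long cpu)
{
	fake_walk(fake_tg_shares_up, (void *)cpu);	/* pass the scalar as a pointer */
}

int main(void)
{
	fake_update_shares(3);
	return 0;
}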
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
 
 #else
 
-static inline void update_shares(struct sched_domain *sd)
+static inline void update_shares(int cpu)
 {
 }
 
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-	cfs_rq->shares = shares;
-#endif
-}
-#endif
-
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
 	SET_SYSCTL(sched_min_granularity);
 	SET_SYSCTL(sched_latency);
 	SET_SYSCTL(sched_wakeup_granularity);
-	SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
 
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 		se->cfs_rq = parent->my_q;
 
 	se->my_q = cfs_rq;
-	se->load.weight = tg->shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, tg->shares);
 	se->parent = parent;
 }
 #endif
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-					    __alignof__(unsigned long));
-#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
 
-	se->load.weight = shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, shares);
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/*
 		 * force a rebalance
 		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
 	}
 