@@ -932,10 +932,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
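
The new numa_is_active_node() helper treats a node as part of the group's pseudo-interleaving set once it triggers more than 1/3 of the busiest node's CPU faults. Below is a minimal userspace sketch of that threshold check; the array, node count and fault numbers are invented for illustration and are not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

/* Illustrative per-node CPU fault counts for a hypothetical 4-node group. */
static unsigned long faults_cpu[4] = { 900, 400, 250, 40 };
static unsigned long max_faults_cpu = 900;

static bool is_active_node(int nid)
{
	/* Same comparison as numa_is_active_node(): faults * 3 > max. */
	return faults_cpu[nid] * ACTIVE_NODE_FRACTION > max_faults_cpu;
}

int main(void)
{
	for (int nid = 0; nid < 4; nid++)
		printf("node %d: %s\n", nid, is_active_node(nid) ? "active" : "inactive");
	return 0;	/* nodes 0 and 1 are active (2700, 1200 > 900); 2 and 3 are not. */
}
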
@@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
-	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+	    ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
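
The rewritten return statement applies the 3/4 hysteresis without any division by cross-multiplying the two fault ratios. A worked example with made-up counts: say faults_cpu(dst) = 400, faults_mem(dst) = 300, faults_cpu(src) = 200 and faults_mem(src) = 600. Then

	400 * 600 * 3 = 720000  >  200 * 300 * 4 = 240000

so the page is migrated: the destination's CPU-to-memory fault ratio (about 1.33) exceeds 4/3 of the source's (about 0.33). Swapping the two nodes' numbers gives 240000 > 720000, which is false, so pages stay put unless the imbalance clears the 3/4 threshold.
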
@@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * multiple NUMA nodes; in order to better consolidate the group,
 	 * we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-			nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
 
@@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
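
For a group whose per-node CPU fault counts were, say, 900, 400, 250 and 40 (the same invented numbers as in the sketch above), the first loop records max_faults = 900 and the second counts two nodes above the 1/3 threshold (2700 and 1200 exceed 900; 750 and 120 do not), so the group ends up with max_faults_cpu = 900 and active_nodes = 2. A group confined to one node keeps active_nodes = 1, which is what the "active_nodes > 1" checks elsewhere key off.
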
@@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			return;
 
 		atomic_set(&grp->refcount, 1);
+		grp->active_nodes = 1;
+		grp->max_faults_cpu = 0;
 		spin_lock_init(&grp->lock);
 		grp->gid = p->pid;
 		/* Second half of the array tracks nids where faults happen */
 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-		node_set(task_node(current), grp->active_nodes);
-
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] = p->numa_faults[i];
 
@@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-			node_isset(cpu_node, p->numa_group->active_nodes) &&
-			node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+	    numa_is_active_node(cpu_node, ng) &&
+	    numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);
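
With the nodemask gone, a shared, non-private fault now counts as local only when the group spans more than one active node and both the faulting CPU's node and the page's node pass numa_is_active_node(). For example, a task running on node 0 that faults on memory resident on node 1 is treated as a local fault when both nodes are in the pseudo-interleaving set; that count flows into p->numa_faults_locality[] and lets the NUMA scan period grow once the placement has settled, as the comment above describes.
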