 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
 #define PCPU_ATOMIC_MAP_MARGIN_LOW	32
 #define PCPU_ATOMIC_MAP_MARGIN_HIGH	64
+#define PCPU_EMPTY_POP_PAGES_LOW	2
+#define PCPU_EMPTY_POP_PAGES_HIGH	4
 
 #ifdef CONFIG_SMP
 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
  */
 static int pcpu_nr_empty_pop_pages;
 
-/* balance work is used to populate or destroy chunks asynchronously */
+/*
+ * Balance work is used to populate or destroy chunks asynchronously.  We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
 static void pcpu_balance_workfn(struct work_struct *work);
 static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
+
+static void pcpu_schedule_balance_work(void)
+{
+	if (pcpu_async_enabled)
+		schedule_work(&pcpu_balance_work);
+}
 
 static bool pcpu_addr_in_first_chunk(void *addr)
 {
@@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
 		margin = 3;
 
 		if (chunk->map_alloc <
-		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW)
+		    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
+		    pcpu_async_enabled)
 			schedule_work(&chunk->map_extend_work);
 	} else {
 		margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
@@ -1005,6 +1021,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	if (chunk != pcpu_reserved_chunk)
 		pcpu_nr_empty_pop_pages -= occ_pages;
 
+	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+		pcpu_schedule_balance_work();
+
 	/* clear the areas and return address relative to base address */
 	for_each_possible_cpu(cpu)
 		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
@@ -1023,6 +1042,11 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		if (!--warn_limit)
 			pr_info("PERCPU: limit reached, disable warning\n");
 	}
+	if (is_atomic) {
+		/* see the flag handling in pcpu_balance_workfn() */
+		pcpu_atomic_alloc_failed = true;
+		pcpu_schedule_balance_work();
+	}
 	return NULL;
 }
 
@@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 }
 
 /**
- * pcpu_balance_workfn - reclaim fully free chunks, workqueue function
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
@@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work)
 	LIST_HEAD(to_free);
 	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
+	int slot, nr_to_pop, ret;
 
+	/*
+	 * There's no reason to keep around multiple unused chunks and VM
+	 * areas can be scarce.  Destroy all free chunks except for one.
+	 */
 	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irq(&pcpu_lock);
 
@@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work)
 		pcpu_destroy_chunk(chunk);
 	}
 
+	/*
+	 * Ensure there is a certain number of free populated pages for
+	 * atomic allocs.  Fill up from the most packed so that atomic
+	 * allocs don't increase fragmentation.  If atomic allocation
+	 * failed previously, always populate the maximum amount.  This
+	 * should prevent atomic allocs larger than PAGE_SIZE from failing
+	 * indefinitely; however, large atomic allocs are not something we
+	 * support properly and can be highly unreliable and inefficient.
+	 */
+retry_pop:
+	if (pcpu_atomic_alloc_failed) {
+		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+		/* best effort anyway, don't worry about synchronization */
+		pcpu_atomic_alloc_failed = false;
+	} else {
+		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+				  pcpu_nr_empty_pop_pages,
+				  0, PCPU_EMPTY_POP_PAGES_HIGH);
+	}
+
+	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+		int nr_unpop = 0, rs, re;
+
+		if (!nr_to_pop)
+			break;
+
+		spin_lock_irq(&pcpu_lock);
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+			if (nr_unpop)
+				break;
+		}
+		spin_unlock_irq(&pcpu_lock);
+
+		if (!nr_unpop)
+			continue;
+
+		/* @chunk can't go away while pcpu_alloc_mutex is held */
+		pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+			int nr = min(re - rs, nr_to_pop);
+
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			if (!ret) {
+				nr_to_pop -= nr;
+				spin_lock_irq(&pcpu_lock);
+				pcpu_chunk_populated(chunk, rs, rs + nr);
+				spin_unlock_irq(&pcpu_lock);
+			} else {
+				nr_to_pop = 0;
+			}
+
+			if (!nr_to_pop)
+				break;
+		}
+	}
+
+	if (nr_to_pop) {
+		/* ran out of chunks to populate, create a new one and retry */
+		chunk = pcpu_create_chunk();
+		if (chunk) {
+			spin_lock_irq(&pcpu_lock);
+			pcpu_chunk_relocate(chunk, -1);
+			spin_unlock_irq(&pcpu_lock);
+			goto retry_pop;
+		}
+	}
+
 	mutex_unlock(&pcpu_alloc_mutex);
 }
 
@@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr)
 
 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
-				schedule_work(&pcpu_balance_work);
+				pcpu_schedule_balance_work();
 				break;
 			}
 	}
@@ -2187,3 +2284,15 @@ void __init percpu_init_late(void)
 		spin_unlock_irqrestore(&pcpu_lock, flags);
 	}
 }
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab nor
+ * workqueue is available.  Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+	pcpu_async_enabled = true;
+	return 0;
+}
+subsys_initcall(percpu_enable_async);
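
Editor's note (not part of the diff): the hunks above implement a watermark scheme. pcpu_alloc() schedules pcpu_balance_work once the count of empty populated pages drops below PCPU_EMPTY_POP_PAGES_LOW, and the work function repopulates toward PCPU_EMPTY_POP_PAGES_HIGH, or the full HIGH amount after an atomic allocation failure. The standalone userspace C sketch below mirrors only that nr_to_pop bookkeeping; the constants and names are simplified stand-ins and the actual page population is replaced by a counter.

#include <stdbool.h>
#include <stdio.h>

#define EMPTY_POP_PAGES_LOW	2	/* schedule balance work below this */
#define EMPTY_POP_PAGES_HIGH	4	/* balance work refills up to this */

static int nr_empty_pop_pages = EMPTY_POP_PAGES_HIGH;
static bool atomic_alloc_failed;

static int clamp_int(int val, int lo, int hi)
{
	return val < lo ? lo : (val > hi ? hi : val);
}

/* mirrors the nr_to_pop computation in pcpu_balance_workfn() above */
static void balance_workfn(void)
{
	int nr_to_pop;

	if (atomic_alloc_failed) {
		/* an atomic alloc failed earlier: populate the maximum amount */
		nr_to_pop = EMPTY_POP_PAGES_HIGH;
		atomic_alloc_failed = false;
	} else {
		nr_to_pop = clamp_int(EMPTY_POP_PAGES_HIGH - nr_empty_pop_pages,
				      0, EMPTY_POP_PAGES_HIGH);
	}

	/* stand-in for populating nr_to_pop pages from existing chunks */
	nr_empty_pop_pages += nr_to_pop;
}

int main(void)
{
	/* pretend an allocation consumed three empty populated pages */
	nr_empty_pop_pages -= 3;

	/* in the kernel this check sits in pcpu_alloc() and the work runs async */
	if (nr_empty_pop_pages < EMPTY_POP_PAGES_LOW)
		balance_workfn();

	printf("empty populated pages after balancing: %d\n", nr_empty_pop_pages);
	return 0;
}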