@@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
 	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
 }
 
+/*
+ * Use the second-highest bit of the inuse_pages counter to indicate
+ * whether a swap device is on the available plist, so the counter can
+ * still be updated arithmetically while carrying this embedded flag.
+ *
+ * The inuse_pages counter is the only thing (besides swapon / swapoff)
+ * that decides whether a device should be on the avail_lists. With the
+ * off-list bit embedded in the atomic counter, updates no longer need
+ * any lock to check the list status.
+ *
+ * This bit is set while the device is off the plist and thus not
+ * usable; it is cleared while the device is on the plist.
+ */
+#define SWAP_USAGE_OFFLIST_BIT	(1UL << (BITS_PER_TYPE(atomic_t) - 2))
+#define SWAP_USAGE_COUNTER_MASK	(~SWAP_USAGE_OFFLIST_BIT)
+static long swap_usage_in_pages(struct swap_info_struct *si)
+{
+	return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
+}
+
 /* Reclaim the swap entry anyway if possible */
 #define TTRS_ANYWAY		0x1
 /*
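
A quick standalone illustration of this layout (a userspace C11 sketch, not part of the patch; OFFLIST_BIT, COUNTER_MASK and usage_in_pages are stand-in names for the kernel's SWAP_USAGE_* macros and swap_usage_in_pages above): plain add/sub on the counter leaves the embedded flag untouched, since the page count is bounded by the device size and can never carry into the flag bit.

    #include <stdatomic.h>
    #include <stdio.h>

    /* Second-highest bit of a 32-bit counter, mirroring SWAP_USAGE_OFFLIST_BIT. */
    #define OFFLIST_BIT  (1L << (8 * (int)sizeof(int) - 2))
    #define COUNTER_MASK (~OFFLIST_BIT)

    static atomic_long inuse_pages;

    /* Analogue of swap_usage_in_pages(): read the count, mask off the flag. */
    static long usage_in_pages(void)
    {
            return atomic_load(&inuse_pages) & COUNTER_MASK;
    }

    int main(void)
    {
            atomic_fetch_add(&inuse_pages, 3);          /* allocate 3 slots */
            atomic_fetch_or(&inuse_pages, OFFLIST_BIT); /* mark device off-list */
            printf("in use: %ld, off-list: %d\n", usage_in_pages(),
                   !!(atomic_load(&inuse_pages) & OFFLIST_BIT));   /* 3, 1 */
            atomic_fetch_sub(&inuse_pages, 1);          /* free one slot */
            printf("in use: %ld\n", usage_in_pages());  /* 2; flag undisturbed */
            return 0;
    }
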
@@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 	int nr_reclaim;
 
 	if (force)
-		to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
+		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
 
 	while (!list_empty(&si->full_clusters)) {
 		ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
@@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 	return found;
 }
 
-static void __del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
+static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 {
 	int nid;
+	unsigned long pages;
+
+	spin_lock(&swap_avail_lock);
+
+	if (swapoff) {
+		/*
+		 * Forcefully remove it. Clear the SWP_WRITEOK flag for
+		 * swapoff here so the update is synchronized by both
+		 * si->lock and swap_avail_lock, ensuring the result is
+		 * visible to add_to_avail_list.
+		 */
+		lockdep_assert_held(&si->lock);
+		si->flags &= ~SWP_WRITEOK;
+		atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+	} else {
+		/*
+		 * If not called by swapoff, take it off-list only if it's
+		 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly,
+		 * si->inuse_pages == pages). Any concurrent slot freeing,
+		 * or the device having already been removed from the
+		 * plist by someone else, will make the cmpxchg fail.
+		 */
+		pages = si->pages;
+		if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+					     pages | SWAP_USAGE_OFFLIST_BIT))
+			goto skip;
+	}
 
-	assert_spin_locked(&si->lock);
 	for_each_node(nid)
 		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
+	spin_unlock(&swap_avail_lock);
 }
 
-static void del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
+static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 {
+	int nid;
+	long val;
+	unsigned long pages;
+
 	spin_lock(&swap_avail_lock);
-	__del_from_avail_list(si);
+
+	/* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
+	if (swapon) {
+		lockdep_assert_held(&si->lock);
+		si->flags |= SWP_WRITEOK;
+	} else {
+		if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
+			goto skip;
+	}
+
+	if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
+		goto skip;
+
+	val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+
+	/*
+	 * While the device is full and on the plist, only one updater will
+	 * see (inuse_pages == si->pages) and call del_from_avail_list. If
+	 * that updater happens to be here, just skip adding.
+	 */
+	pages = si->pages;
+	if (val == pages) {
+		/* Just like the cmpxchg in del_from_avail_list */
+		if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+					    pages | SWAP_USAGE_OFFLIST_BIT))
+			goto skip;
+	}
+
+	for_each_node(nid)
+		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
 	spin_unlock(&swap_avail_lock);
 }
 
-static void swap_range_alloc(struct swap_info_struct *si,
-			     unsigned int nr_entries)
+/*
+ * swap_usage_add / swap_usage_sub for each slot are serialized by ci->lock
+ * within each cluster, so the total contribution to the global counter is
+ * always non-negative and cannot exceed the total number of usable slots.
+ */
+static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
 {
-	WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
-	if (si->inuse_pages == si->pages) {
-		del_from_avail_list(si);
+	long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
 
-		if (vm_swap_full())
-			schedule_work(&si->reclaim_work);
+	/*
+	 * If the device is now full and SWAP_USAGE_OFFLIST_BIT is not
+	 * yet set, take it off the plist.
+	 */
+	if (unlikely(val == si->pages)) {
+		del_from_avail_list(si, false);
+		return true;
 	}
+
+	return false;
 }
 
-static void add_to_avail_list(struct swap_info_struct *si)
+static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
 {
-	int nid;
+	long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
 
-	spin_lock(&swap_avail_lock);
-	for_each_node(nid)
-		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
-	spin_unlock(&swap_avail_lock);
+	/*
+	 * If the device is no longer full and SWAP_USAGE_OFFLIST_BIT is
+	 * set, add it back to the plist.
+	 */
+	if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
+		add_to_avail_list(si, false);
+}
+
+static void swap_range_alloc(struct swap_info_struct *si,
+			     unsigned int nr_entries)
+{
+	if (swap_usage_add(si, nr_entries)) {
+		if (vm_swap_full())
+			schedule_work(&si->reclaim_work);
+	}
 }
 
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
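
The transition protocol implemented by the two helpers above condenses to the sketch below (a userspace C11 model with simplified names and no swap_avail_lock, not the kernel code). The flag is only ever set by a cmpxchg from the exact "full, on-list" value, so at most one updater wins the transition; any racing free changes the counter first and makes the cmpxchg fail harmlessly.

    #include <stdatomic.h>
    #include <stdbool.h>

    #define OFFLIST_BIT (1L << 30)

    static long pages = 1024;          /* usable slots on this device */
    static atomic_long inuse_pages;

    /* Analogue of swap_usage_add(): true if this caller observed the
     * exact "full" value and took the device off-list. */
    static bool usage_add(long nr)
    {
            long val = atomic_fetch_add(&inuse_pages, nr) + nr;
            long expected = pages;

            /* A concurrent free (or another winner) changes the counter
             * first, the cmpxchg fails, and the device stays listed. */
            return val == pages &&
                   atomic_compare_exchange_strong(&inuse_pages, &expected,
                                                  pages | OFFLIST_BIT);
    }

    /* Analogue of swap_usage_sub(): a free that still sees the flag knows
     * the device has room again and must be put back on the list. */
    static bool usage_sub(long nr)
    {
            long val = atomic_fetch_sub(&inuse_pages, nr) - nr;

            return val & OFFLIST_BIT;
    }

    int main(void)
    {
            usage_add(pages);                   /* fill the device: off-list */
            if (usage_sub(1))                   /* first free sees the flag  */
                    atomic_fetch_and(&inuse_pages, ~OFFLIST_BIT); /* re-list */
            return 0;
    }

In the kernel the re-add path is richer: add_to_avail_list() also re-checks SWP_WRITEOK and, if the device turns out to be full again right after clearing the bit, re-sets it with the same cmpxchg instead of re-adding.
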
@@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 	for (i = 0; i < nr_entries; i++)
 		clear_bit(offset + i, si->zeromap);
 
-	if (si->inuse_pages == si->pages)
-		add_to_avail_list(si);
 	if (si->flags & SWP_BLKDEV)
 		swap_slot_free_notify =
 			si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 	 */
 	smp_wmb();
 	atomic_long_add(nr_entries, &nr_swap_pages);
-	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
+	swap_usage_sub(si, nr_entries);
 }
 
 static int cluster_alloc_swap(struct swap_info_struct *si,
@@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
 		spin_lock(&si->lock);
-		if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
-			spin_lock(&swap_avail_lock);
-			if (plist_node_empty(&si->avail_lists[node])) {
-				spin_unlock(&si->lock);
-				goto nextsi;
-			}
-			WARN(!(si->flags & SWP_WRITEOK),
-			     "swap_info %d in list but !SWP_WRITEOK\n",
-			     si->type);
-			__del_from_avail_list(si);
-			spin_unlock(&si->lock);
-			goto nextsi;
-		}
 		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
 					    n_goal, swp_entries, order);
 		spin_unlock(&si->lock);
@@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 			cond_resched();
 
 		spin_lock(&swap_avail_lock);
-nextsi:
 		/*
 		 * if we got here, it's likely that si was almost full before,
 		 * and since scan_swap_map_slots() can drop the si->lock,
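
With the off-list state folded into the counter itself, a device that fills up now takes itself off the plist from the allocation path, so the open-coded re-check under swap_avail_lock, the WARN on !SWP_WRITEOK, and the nextsi label above are no longer needed; an allocator that races with the removal simply fails scan_swap_map_slots() and moves on to the next device.
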
@@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
 		if (sis->flags & SWP_WRITEOK) {
 			n = sis->pages;
 			if (free)
-				n -= sis->inuse_pages;
+				n -= swap_usage_in_pages(sis);
 		}
 		spin_unlock(&sis->lock);
 	}
@@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
 	swp_entry_t entry;
 	unsigned int i;
 
-	if (!READ_ONCE(si->inuse_pages))
+	if (!swap_usage_in_pages(si))
 		goto success;
 
 retry:
@@ -2137,7 +2227,7 @@ static int try_to_unuse(unsigned int type)
 
 	spin_lock(&mmlist_lock);
 	p = &init_mm.mmlist;
-	while (READ_ONCE(si->inuse_pages) &&
+	while (swap_usage_in_pages(si) &&
 	       !signal_pending(current) &&
 	       (p = p->next) != &init_mm.mmlist) {
 
@@ -2165,7 +2255,7 @@ static int try_to_unuse(unsigned int type)
 		mmput(prev_mm);
 
 	i = 0;
-	while (READ_ONCE(si->inuse_pages) &&
+	while (swap_usage_in_pages(si) &&
 	       !signal_pending(current) &&
 	       (i = find_next_to_unuse(si, i)) != 0) {
 
@@ -2200,7 +2290,7 @@ static int try_to_unuse(unsigned int type)
 	 * folio_alloc_swap(), temporarily hiding that swap. It's easy
 	 * and robust (though cpu-intensive) just to keep retrying.
 	 */
-	if (READ_ONCE(si->inuse_pages)) {
+	if (swap_usage_in_pages(si)) {
 		if (!signal_pending(current))
 			goto retry;
 		return -EINTR;
@@ -2227,7 +2317,7 @@ static void drain_mmlist(void)
 	unsigned int type;
 
 	for (type = 0; type < nr_swapfiles; type++)
-		if (swap_info[type]->inuse_pages)
+		if (swap_usage_in_pages(swap_info[type]))
 			return;
 	spin_lock(&mmlist_lock);
 	list_for_each_safe(p, next, &init_mm.mmlist)
@@ -2406,7 +2496,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
 
 static void _enable_swap_info(struct swap_info_struct *si)
 {
-	si->flags |= SWP_WRITEOK;
 	atomic_long_add(si->pages, &nr_swap_pages);
 	total_swap_pages += si->pages;
 
@@ -2423,9 +2512,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
 	 */
 	plist_add(&si->list, &swap_active_head);
 
-	/* add to available list if swap device is not full */
-	if (si->inuse_pages < si->pages)
-		add_to_avail_list(si);
+	/* Add back to available list */
+	add_to_avail_list(si, true);
 }
 
 static void enable_swap_info(struct swap_info_struct *si, int prio,
@@ -2523,7 +2611,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		goto out_dput;
 	}
 	spin_lock(&p->lock);
-	del_from_avail_list(p);
+	del_from_avail_list(p, true);
 	if (p->prio < 0) {
 		struct swap_info_struct *si = p;
 		int nid;
@@ -2541,7 +2629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	plist_del(&p->list, &swap_active_head);
 	atomic_long_sub(p->pages, &nr_swap_pages);
 	total_swap_pages -= p->pages;
-	p->flags &= ~SWP_WRITEOK;
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 
@@ -2721,7 +2808,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	}
 
 	bytes = K(si->pages);
-	inuse = K(READ_ONCE(si->inuse_pages));
+	inuse = K(swap_usage_in_pages(si));
 
 	file = si->swap_file;
 	len = seq_file_path(swap, file, " \t\n\\");
@@ -2838,6 +2925,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	}
 	spin_lock_init(&p->lock);
 	spin_lock_init(&p->cont_lock);
+	atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
 	init_completion(&p->comp);
 
 	return p;
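
Note the initialization above: a freshly allocated swap_info_struct now starts with SWAP_USAGE_OFFLIST_BIT already set, so it reads as off-list and unusable from the moment it is created. swapon later clears the bit and sets SWP_WRITEOK in one place, via the add_to_avail_list(si, true) call in _enable_swap_info() shown earlier.
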
@@ -3335,7 +3423,7 @@ void si_swapinfo(struct sysinfo *val)
 		struct swap_info_struct *si = swap_info[type];
 
 		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
-			nr_to_be_unused += READ_ONCE(si->inuse_pages);
+			nr_to_be_unused += swap_usage_in_pages(si);
 	}
 	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
 	val->totalswap = total_swap_pages + nr_to_be_unused;