
Commit b228386

ryncsn authored and akpm00 committed
mm, swap: clean up plist removal and adding
When the swap device is full (inuse_pages == pages), it should be removed from the allocation available plist. If any slot is freed, the swap device should be added back to the plist. Additionally, during swapon or swapoff, the swap device is forcefully added or removed.

Currently, the condition (inuse_pages == pages) is checked after every counter update, and the device is removed from or added to the plist accordingly. This is serialized by si->lock.

This commit decouples plist maintenance from the protection of si->lock and reworks plist removal and adding, making it possible to get rid of the hard dependency on si->lock in the allocation path in later commits.

To achieve this, simply using another lock is not optimal: the overhead is observable for a hot counter, and it may cause complex locking issues. Instead, this commit makes it a lock-free atomic operation by embedding the plist state into the second highest bit of the atomic counter.

Simply making the counter atomic is not enough: if the counter update and the plist status check are not performed atomically, an addition or removal may be missed. With the embedded info, the counter can be updated and the plist status checked with a single atomic operation, avoiding any extra overhead:

If the counter is full (inuse_pages == pages) and the off-list bit is unset, we attempt to remove the device from the plist. If the counter is not full (inuse_pages != pages) and the off-list bit is set, we attempt to add the device to the plist. Removal, addition and the bit update are serialized with a lock, which is a cold path. Ordinary counter updates remain lock-free.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Kairui Song <[email protected]>
Cc: Baoquan He <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Chis Li <[email protected]>
Cc: "Huang, Ying" <[email protected]>
Cc: Hugh Dickens <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Kalesh Singh <[email protected]>
Cc: Nhat Pham <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 2770152 commit b228386
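
The following is a small, self-contained userspace C sketch of the idea described in the commit message: one atomic counter carries both the usage count and an off-list flag in its second highest bit, so a counter update and the list-status decision happen in a single atomic operation. The names (demo_device, demo_usage_add, demo_usage_sub, OFFLIST_BIT) are invented for illustration only and are not the kernel API; the real helpers and locking appear in the mm/swapfile.c diff below.

/*
 * Hypothetical userspace demo (NOT kernel code): embed an "off-list" flag
 * in the second highest bit of an atomic usage counter, so fullness checks
 * and counter updates are one atomic operation.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define OFFLIST_BIT   (1L << 30)        /* second highest bit of a 32-bit counter */
#define COUNTER_MASK  (~OFFLIST_BIT)

struct demo_device {
        long pages;                     /* total usable slots */
        atomic_long inuse;              /* usage count with OFFLIST_BIT embedded */
};

static long demo_usage(struct demo_device *dev)
{
        return atomic_load(&dev->inuse) & COUNTER_MASK;
}

/* Returns true if this caller saw the device become full and took it off-list. */
static bool demo_usage_add(struct demo_device *dev, long nr)
{
        long val = atomic_fetch_add_explicit(&dev->inuse, nr,
                                             memory_order_relaxed) + nr;

        if (val == dev->pages) {
                /*
                 * Counter is exactly "full" and the off-list bit is clear:
                 * set the bit with one cmpxchg. A concurrent free changes
                 * the counter and makes the cmpxchg fail, so the device
                 * then stays on the list.
                 */
                long expected = dev->pages;
                if (atomic_compare_exchange_strong(&dev->inuse, &expected,
                                                   expected | OFFLIST_BIT))
                        return true;    /* caller removes device from the list */
        }
        return false;
}

/* Returns true if the device is off-list but no longer full and needs re-adding. */
static bool demo_usage_sub(struct demo_device *dev, long nr)
{
        long val = atomic_fetch_sub_explicit(&dev->inuse, nr,
                                             memory_order_relaxed) - nr;

        /* The caller would clear OFFLIST_BIT and re-add under a (cold) lock. */
        return (val & OFFLIST_BIT) && (val & COUNTER_MASK) != dev->pages;
}

int main(void)
{
        struct demo_device dev = { .pages = 4 };

        atomic_init(&dev.inuse, 0);
        printf("became full: %d\n", demo_usage_add(&dev, 4));  /* 1: take off-list */
        printf("in use: %ld\n", demo_usage(&dev));             /* 4 */
        printf("needs re-add: %d\n", demo_usage_sub(&dev, 1)); /* 1: put back on list */
        return 0;
}

As in the commit message, only the rare full / not-full transitions take a lock to touch the plist; ordinary counter updates stay lock-free.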


2 files changed: +138 -50 lines changed


include/linux/swap.h

Lines changed: 1 addition & 1 deletion
@@ -306,7 +306,7 @@ struct swap_info_struct {
         /* list of cluster that are fragmented or contented */
         unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
         unsigned int pages;             /* total of usable pages of swap */
-        unsigned int inuse_pages;       /* number of those currently in use */
+        atomic_long_t inuse_pages;      /* number of those currently in use */
         struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
         struct rb_root swap_extent_root;/* root of the swap extent rbtree */
         struct block_device *bdev;      /* swap device or bdev of swap file */

mm/swapfile.c

Lines changed: 137 additions & 49 deletions
@@ -128,6 +128,26 @@ static inline unsigned char swap_count(unsigned char ent)
         return ent & ~SWAP_HAS_CACHE;  /* may include COUNT_CONTINUED flag */
 }
 
+/*
+ * Use the second highest bit of inuse_pages counter as the indicator
+ * if one swap device is on the available plist, so the atomic can
+ * still be updated arithmetically while having special data embedded.
+ *
+ * inuse_pages counter is the only thing indicating if a device should
+ * be on avail_lists or not (except swapon / swapoff). By embedding the
+ * off-list bit in the atomic counter, updates no longer need any lock
+ * to check the list status.
+ *
+ * This bit will be set if the device is not on the plist and not
+ * usable, will be cleared if the device is on the plist.
+ */
+#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
+#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
+static long swap_usage_in_pages(struct swap_info_struct *si)
+{
+        return atomic_long_read(&si->inuse_pages) & SWAP_USAGE_COUNTER_MASK;
+}
+
 /* Reclaim the swap entry anyway if possible */
 #define TTRS_ANYWAY     0x1
 /*
@@ -717,7 +737,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
         int nr_reclaim;
 
         if (force)
-                to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
+                to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
 
         while (!list_empty(&si->full_clusters)) {
                 ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
@@ -872,42 +892,128 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
         return found;
 }
 
-static void __del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
+static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 {
         int nid;
+        unsigned long pages;
+
+        spin_lock(&swap_avail_lock);
+
+        if (swapoff) {
+                /*
+                 * Forcefully remove it. Clear the SWP_WRITEOK flags for
+                 * swapoff here so it's synchronized by both si->lock and
+                 * swap_avail_lock, to ensure the result can be seen by
+                 * add_to_avail_list.
+                 */
+                lockdep_assert_held(&si->lock);
+                si->flags &= ~SWP_WRITEOK;
+                atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+        } else {
+                /*
+                 * If not called by swapoff, take it off-list only if it's
+                 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
+                 * si->inuse_pages == pages), any concurrent slot freeing,
+                 * or device already removed from plist by someone else
+                 * will make this return false.
+                 */
+                pages = si->pages;
+                if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+                                             pages | SWAP_USAGE_OFFLIST_BIT))
+                        goto skip;
+        }
 
-        assert_spin_locked(&si->lock);
         for_each_node(nid)
                 plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
+        spin_unlock(&swap_avail_lock);
 }
 
-static void del_from_avail_list(struct swap_info_struct *si)
+/* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
+static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 {
+        int nid;
+        long val;
+        unsigned long pages;
+
         spin_lock(&swap_avail_lock);
-        __del_from_avail_list(si);
+
+        /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
+        if (swapon) {
+                lockdep_assert_held(&si->lock);
+                si->flags |= SWP_WRITEOK;
+        } else {
+                if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
+                        goto skip;
+        }
+
+        if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
+                goto skip;
+
+        val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
+
+        /*
+         * When device is full and device is on the plist, only one updater will
+         * see (inuse_pages == si->pages) and will call del_from_avail_list. If
+         * that updater happen to be here, just skip adding.
+         */
+        pages = si->pages;
+        if (val == pages) {
+                /* Just like the cmpxchg in del_from_avail_list */
+                if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
+                                            pages | SWAP_USAGE_OFFLIST_BIT))
+                        goto skip;
+        }
+
+        for_each_node(nid)
+                plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+
+skip:
         spin_unlock(&swap_avail_lock);
 }
 
-static void swap_range_alloc(struct swap_info_struct *si,
-                             unsigned int nr_entries)
+/*
+ * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
+ * within each cluster, so the total contribution to the global counter should
+ * always be positive and cannot exceed the total number of usable slots.
+ */
+static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
 {
-        WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
-        if (si->inuse_pages == si->pages) {
-                del_from_avail_list(si);
+        long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);
 
-                if (vm_swap_full())
-                        schedule_work(&si->reclaim_work);
+        /*
+         * If device is full, and SWAP_USAGE_OFFLIST_BIT is not set,
+         * remove it from the plist.
+         */
+        if (unlikely(val == si->pages)) {
+                del_from_avail_list(si, false);
+                return true;
         }
+
+        return false;
 }
 
-static void add_to_avail_list(struct swap_info_struct *si)
+static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
 {
-        int nid;
+        long val = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);
 
-        spin_lock(&swap_avail_lock);
-        for_each_node(nid)
-                plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
-        spin_unlock(&swap_avail_lock);
+        /*
+         * If device is not full, and SWAP_USAGE_OFFLIST_BIT is set,
+         * remove it from the plist.
+         */
+        if (unlikely(val & SWAP_USAGE_OFFLIST_BIT))
+                add_to_avail_list(si, false);
+}
+
+static void swap_range_alloc(struct swap_info_struct *si,
+                             unsigned int nr_entries)
+{
+        if (swap_usage_add(si, nr_entries)) {
+                if (vm_swap_full())
+                        schedule_work(&si->reclaim_work);
+        }
 }
 
 static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
@@ -925,8 +1031,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
         for (i = 0; i < nr_entries; i++)
                 clear_bit(offset + i, si->zeromap);
 
-        if (si->inuse_pages == si->pages)
-                add_to_avail_list(si);
         if (si->flags & SWP_BLKDEV)
                 swap_slot_free_notify =
                         si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -946,7 +1050,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
          */
         smp_wmb();
         atomic_long_add(nr_entries, &nr_swap_pages);
-        WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
+        swap_usage_sub(si, nr_entries);
 }
 
 static int cluster_alloc_swap(struct swap_info_struct *si,
@@ -1036,19 +1140,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
                 plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
                 spin_unlock(&swap_avail_lock);
                 spin_lock(&si->lock);
-                if ((si->inuse_pages == si->pages) || !(si->flags & SWP_WRITEOK)) {
-                        spin_lock(&swap_avail_lock);
-                        if (plist_node_empty(&si->avail_lists[node])) {
-                                spin_unlock(&si->lock);
-                                goto nextsi;
-                        }
-                        WARN(!(si->flags & SWP_WRITEOK),
-                             "swap_info %d in list but !SWP_WRITEOK\n",
-                             si->type);
-                        __del_from_avail_list(si);
-                        spin_unlock(&si->lock);
-                        goto nextsi;
-                }
                 n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
                                             n_goal, swp_entries, order);
                 spin_unlock(&si->lock);
@@ -1057,7 +1148,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
                         cond_resched();
 
                 spin_lock(&swap_avail_lock);
-nextsi:
                 /*
                  * if we got here, it's likely that si was almost full before,
                  * and since scan_swap_map_slots() can drop the si->lock,
@@ -1789,7 +1879,7 @@ unsigned int count_swap_pages(int type, int free)
                 if (sis->flags & SWP_WRITEOK) {
                         n = sis->pages;
                         if (free)
-                                n -= sis->inuse_pages;
+                                n -= swap_usage_in_pages(sis);
                 }
                 spin_unlock(&sis->lock);
         }
@@ -2124,7 +2214,7 @@ static int try_to_unuse(unsigned int type)
         swp_entry_t entry;
         unsigned int i;
 
-        if (!READ_ONCE(si->inuse_pages))
+        if (!swap_usage_in_pages(si))
                 goto success;
 
 retry:
@@ -2137,7 +2227,7 @@
 
         spin_lock(&mmlist_lock);
         p = &init_mm.mmlist;
-        while (READ_ONCE(si->inuse_pages) &&
+        while (swap_usage_in_pages(si) &&
                !signal_pending(current) &&
                (p = p->next) != &init_mm.mmlist) {
 
@@ -2165,7 +2255,7 @@
         mmput(prev_mm);
 
         i = 0;
-        while (READ_ONCE(si->inuse_pages) &&
+        while (swap_usage_in_pages(si) &&
                !signal_pending(current) &&
                (i = find_next_to_unuse(si, i)) != 0) {
 
@@ -2200,7 +2290,7 @@
          * folio_alloc_swap(), temporarily hiding that swap. It's easy
          * and robust (though cpu-intensive) just to keep retrying.
          */
-        if (READ_ONCE(si->inuse_pages)) {
+        if (swap_usage_in_pages(si)) {
                 if (!signal_pending(current))
                         goto retry;
                 return -EINTR;
@@ -2227,7 +2317,7 @@ static void drain_mmlist(void)
         unsigned int type;
 
         for (type = 0; type < nr_swapfiles; type++)
-                if (swap_info[type]->inuse_pages)
+                if (swap_usage_in_pages(swap_info[type]))
                         return;
         spin_lock(&mmlist_lock);
         list_for_each_safe(p, next, &init_mm.mmlist)
@@ -2406,7 +2496,6 @@ static void setup_swap_info(struct swap_info_struct *si, int prio,
 
 static void _enable_swap_info(struct swap_info_struct *si)
 {
-        si->flags |= SWP_WRITEOK;
         atomic_long_add(si->pages, &nr_swap_pages);
         total_swap_pages += si->pages;
 
@@ -2423,9 +2512,8 @@ static void _enable_swap_info(struct swap_info_struct *si)
          */
         plist_add(&si->list, &swap_active_head);
 
-        /* add to available list if swap device is not full */
-        if (si->inuse_pages < si->pages)
-                add_to_avail_list(si);
+        /* Add back to available list */
+        add_to_avail_list(si, true);
 }
 
 static void enable_swap_info(struct swap_info_struct *si, int prio,
@@ -2523,7 +2611,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                 goto out_dput;
         }
         spin_lock(&p->lock);
-        del_from_avail_list(p);
+        del_from_avail_list(p, true);
         if (p->prio < 0) {
                 struct swap_info_struct *si = p;
                 int nid;
@@ -2541,7 +2629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
         plist_del(&p->list, &swap_active_head);
         atomic_long_sub(p->pages, &nr_swap_pages);
         total_swap_pages -= p->pages;
-        p->flags &= ~SWP_WRITEOK;
         spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
 
@@ -2721,7 +2808,7 @@ static int swap_show(struct seq_file *swap, void *v)
         }
 
         bytes = K(si->pages);
-        inuse = K(READ_ONCE(si->inuse_pages));
+        inuse = K(swap_usage_in_pages(si));
 
         file = si->swap_file;
         len = seq_file_path(swap, file, " \t\n\\");
@@ -2838,6 +2925,7 @@ static struct swap_info_struct *alloc_swap_info(void)
         }
         spin_lock_init(&p->lock);
         spin_lock_init(&p->cont_lock);
+        atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
         init_completion(&p->comp);
 
         return p;
@@ -3335,7 +3423,7 @@ void si_swapinfo(struct sysinfo *val)
                 struct swap_info_struct *si = swap_info[type];
 
                 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
-                        nr_to_be_unused += READ_ONCE(si->inuse_pages);
+                        nr_to_be_unused += swap_usage_in_pages(si);
         }
         val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
         val->totalswap = total_swap_pages + nr_to_be_unused;
