
Commit 63d8620

Miaohe Lin authored and Linus Torvalds committed
mm/swapfile: use percpu_ref to serialize against concurrent swapoff
Patch series "close various race windows for swap", v6.

When I was investigating the swap code, I found some possible race windows. This series aims to fix all these races. But using the current get/put_swap_device() to guard against concurrent swapoff for swap_readpage() looks terrible, because swap_readpage() may take a really long time. To reduce the performance overhead on the hot path as much as possible, we can use percpu_ref to close this race window (as suggested by Huang, Ying).

Patch 1 adds percpu_ref support for swap, and most of the remaining patches use it to close various race windows. More details can be found in the respective changelogs.

This patch (of 4):

Using the current get/put_swap_device() to guard against concurrent swapoff for some swap ops, e.g. swap_readpage(), looks terrible because they might take a really long time. This patch adds percpu_ref support to serialize against concurrent swapoff (as suggested by Huang, Ying). The SWP_VALID flag is also removed, because it was only used together with the RCU solution.

Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Miaohe Lin <[email protected]>
Reviewed-by: "Huang, Ying" <[email protected]>
Cc: Alex Shi <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Dennis Zhou <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Joonsoo Kim <[email protected]>
Cc: Matthew Wilcox <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Wei Yang <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: Yu Zhao <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent e17eae2 commit 63d8620

2 files changed, 52 insertions(+), 32 deletions(-)
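Before the per-file diffs, here is a condensed sketch of the lifecycle this patch gives to si->users: the ref is created dead in alloc_swap_info(), flipped live in enable_swap_info(), taken and dropped by readers through get/put_swap_device(), and killed and drained during swapoff(). The sketch is illustrative only; struct demo_device and the demo_* helpers are invented for this example, and locking, error paths, and the real swap fields are omitted.

#include <linux/completion.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>

/* Illustrative stand-in for the two fields the patch adds to swap_info_struct. */
struct demo_device {
        struct percpu_ref users;        /* indicates and keeps the device valid */
        struct completion comp;         /* signalled once the last user is gone */
};

/* Release callback: runs after the ref is killed and every user has dropped it. */
static void demo_users_ref_free(struct percpu_ref *ref)
{
        struct demo_device *dev = container_of(ref, struct demo_device, users);

        complete(&dev->comp);
}

/* "swapon" side: create the ref dead, finish setup, then flip it live. */
static int demo_enable(struct demo_device *dev)
{
        int err;

        err = percpu_ref_init(&dev->users, demo_users_ref_free,
                              PERCPU_REF_INIT_DEAD, GFP_KERNEL);
        if (err)
                return err;
        init_completion(&dev->comp);
        /* ... initialize the data structures readers will look at ... */
        percpu_ref_resurrect(&dev->users);      /* readers may now get a reference */
        return 0;
}

/* Reader side: the get/put_swap_device() pattern. */
static bool demo_access(struct demo_device *dev)
{
        if (!percpu_ref_tryget_live(&dev->users))
                return false;           /* device is dead or being torn down */
        /* ... safely access the device's data structures here ... */
        percpu_ref_put(&dev->users);
        return true;
}

/* "swapoff" side: refuse new readers, then wait for existing ones to drain. */
static void demo_disable(struct demo_device *dev)
{
        percpu_ref_kill(&dev->users);
        wait_for_completion(&dev->comp);
        percpu_ref_exit(&dev->users);
}

Starting the ref in the DEAD state and resurrecting it only after setup lets percpu_ref_tryget_live() double as the "is this device fully initialized and not being swapped off?" check, which is what allows the SWP_VALID flag and the RCU read-side critical section to be dropped.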

include/linux/swap.h (3 additions, 2 deletions)

@@ -177,7 +177,6 @@ enum {
         SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
         SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
         SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
-        SWP_VALID = (1 << 13),          /* swap is valid to be operated on? */
         /* add others here before... */
         SWP_SCANNING = (1 << 14),       /* refcount in scan_swap_map */
 };
@@ -240,6 +239,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+        struct percpu_ref users;        /* indicate and keep swap device valid. */
         unsigned long flags;            /* SWP_USED etc: see above */
         signed short prio;              /* swap priority of this type */
         struct plist_node list;         /* entry in swap_active_head */
@@ -260,6 +260,7 @@ struct swap_info_struct {
         struct block_device *bdev;      /* swap device or bdev of swap file */
         struct file *swap_file;         /* seldom referenced */
         unsigned int old_block_size;    /* seldom referenced */
+        struct completion comp;         /* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
         unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
         atomic_t frontswap_pages;       /* frontswap pages in-use counter */
@@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-        rcu_read_unlock();
+        percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
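For reference, here is a hedged sketch of how a caller pairs these helpers after the change; demo_swap_count() is a made-up name (in-tree users such as __swap_count() follow the same shape), and only the reference handling is shown.

#include <linux/swap.h>
#include <linux/swapops.h>

/* Hypothetical caller: pin the swap device so concurrent swapoff cannot free si->swap_map. */
static int demo_swap_count(swp_entry_t entry)
{
        struct swap_info_struct *si;
        int count = 0;

        si = get_swap_device(entry);    /* percpu_ref_tryget_live(&si->users) */
        if (!si)
                return 0;               /* raced with swapoff, or bad entry */
        /* si->swap_map, si->cluster_info, etc. stay valid until the put. */
        count = READ_ONCE(si->swap_map[swp_offset(entry)]);
        put_swap_device(si);            /* percpu_ref_put(&si->users) */
        return count;
}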

mm/swapfile.c (49 additions, 30 deletions)

@@ -39,6 +39,7 @@
 #include <linux/export.h>
 #include <linux/swap_slots.h>
 #include <linux/sort.h>
+#include <linux/completion.h>
 
 #include <asm/tlbflush.h>
 #include <linux/swapops.h>
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
         spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+        struct swap_info_struct *si;
+
+        si = container_of(ref, struct swap_info_struct, users);
+        complete(&si->comp);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
         struct swap_cluster_info *ci = si->cluster_info;
@@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  * CPU1                         CPU2
  * do_swap_page()
@@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
         si = swp_swap_info(entry);
         if (!si)
                 goto bad_nofile;
-
-        rcu_read_lock();
-        if (data_race(!(si->flags & SWP_VALID)))
-                goto unlock_out;
+        if (!percpu_ref_tryget_live(&si->users))
+                goto out;
+        /*
+         * Guarantee the si->users are checked before accessing other
+         * fields of swap_info_struct.
+         *
+         * Paired with the spin_unlock() after setup_swap_info() in
+         * enable_swap_info().
+         */
+        smp_rmb();
         offset = swp_offset(entry);
         if (offset >= si->max)
-                goto unlock_out;
+                goto put_out;
 
         return si;
 bad_nofile:
         pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
         return NULL;
-unlock_out:
-        rcu_read_unlock();
+put_out:
+        percpu_ref_put(&si->users);
         return NULL;
 }
 
@@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
 
 static void _enable_swap_info(struct swap_info_struct *p)
 {
-        p->flags |= SWP_WRITEOK | SWP_VALID;
+        p->flags |= SWP_WRITEOK;
         atomic_long_add(p->pages, &nr_swap_pages);
         total_swap_pages += p->pages;
 
@@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
         spin_unlock(&p->lock);
         spin_unlock(&swap_lock);
         /*
-         * Guarantee swap_map, cluster_info, etc. fields are valid
-         * between get/put_swap_device() if SWP_VALID bit is set
+         * Finished initializing swap device, now it's safe to reference it.
          */
-        synchronize_rcu();
+        percpu_ref_resurrect(&p->users);
         spin_lock(&swap_lock);
         spin_lock(&p->lock);
         _enable_swap_info(p);
@@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
         reenable_swap_slots_cache_unlock();
 
-        spin_lock(&swap_lock);
-        spin_lock(&p->lock);
-        p->flags &= ~SWP_VALID;         /* mark swap device as invalid */
-        spin_unlock(&p->lock);
-        spin_unlock(&swap_lock);
         /*
-         * wait for swap operations protected by get/put_swap_device()
-         * to complete
+         * Wait for swap operations protected by get/put_swap_device()
+         * to complete.
+         *
+         * We need synchronize_rcu() here to protect the accessing to
+         * the swap cache data structure.
          */
+        percpu_ref_kill(&p->users);
         synchronize_rcu();
+        wait_for_completion(&p->comp);
 
         flush_work(&p->discard_work);
 
@@ -2857,13 +2865,20 @@ static struct swap_info_struct *alloc_swap_info(void)
         if (!p)
                 return ERR_PTR(-ENOMEM);
 
+        if (percpu_ref_init(&p->users, swap_users_ref_free,
+                            PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+                kvfree(p);
+                return ERR_PTR(-ENOMEM);
+        }
+
         spin_lock(&swap_lock);
         for (type = 0; type < nr_swapfiles; type++) {
                 if (!(swap_info[type]->flags & SWP_USED))
                         break;
         }
         if (type >= MAX_SWAPFILES) {
                 spin_unlock(&swap_lock);
+                percpu_ref_exit(&p->users);
                 kvfree(p);
                 return ERR_PTR(-EPERM);
         }
@@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
                 plist_node_init(&p->avail_lists[i], 0);
         p->flags = SWP_USED;
         spin_unlock(&swap_lock);
-        kvfree(defer);
+        if (defer) {
+                percpu_ref_exit(&defer->users);
+                kvfree(defer);
+        }
         spin_lock_init(&p->lock);
         spin_lock_init(&p->cont_lock);
+        init_completion(&p->comp);
 
         return p;
 }
