|
39 | 39 | #include <linux/export.h>
|
40 | 40 | #include <linux/swap_slots.h>
|
41 | 41 | #include <linux/sort.h>
|
| 42 | +#include <linux/completion.h> |
42 | 43 |
|
43 | 44 | #include <asm/tlbflush.h>
|
44 | 45 | #include <linux/swapops.h>
|
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
|
511 | 512 | spin_unlock(&si->lock);
|
512 | 513 | }
|
513 | 514 |
|
| | +/* |
| | + * percpu_ref release callback for si->users: invoked once the ref |
| | + * count hits zero after percpu_ref_kill() in swapoff(), signalling |
| | + * si->comp so the swapoff() path waiting in wait_for_completion() |
| | + * can proceed — i.e. all get_swap_device() users are gone. |
| | + */ |
| 515 | +static void swap_users_ref_free(struct percpu_ref *ref) |
| 516 | +{ |
| 517 | + struct swap_info_struct *si; |
| 518 | + |
| 519 | + si = container_of(ref, struct swap_info_struct, users); |
| 520 | + complete(&si->comp); |
| 521 | +} |
| 522 | + |
514 | 523 | static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
|
515 | 524 | {
|
516 | 525 | struct swap_cluster_info *ci = si->cluster_info;
|
@@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
|
1270 | 1279 | * via preventing the swap device from being swapoff, until
|
1271 | 1280 | * put_swap_device() is called. Otherwise return NULL.
|
1272 | 1281 | *
|
1273 |
| - * The entirety of the RCU read critical section must come before the |
1274 |
| - * return from or after the call to synchronize_rcu() in |
1275 |
| - * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is |
1276 |
| - * true, the si->map, si->cluster_info, etc. must be valid in the |
1277 |
| - * critical section. |
1278 |
| - * |
1279 | 1282 | * Notice that swapoff or swapoff+swapon can still happen before the
|
1280 |
| - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock() |
1281 |
| - * in put_swap_device() if there isn't any other way to prevent |
1282 |
| - * swapoff, such as page lock, page table lock, etc. The caller must |
1283 |
| - * be prepared for that. For example, the following situation is |
1284 |
| - * possible. |
| 1283 | + * percpu_ref_tryget_live() in get_swap_device() or after the |
| 1284 | + * percpu_ref_put() in put_swap_device() if there isn't any other way |
| 1285 | + * to prevent swapoff, such as page lock, page table lock, etc. The |
| 1286 | + * caller must be prepared for that. For example, the following |
| 1287 | + * situation is possible. |
1285 | 1288 | *
|
1286 | 1289 | * CPU1 CPU2
|
1287 | 1290 | * do_swap_page()
|
@@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
|
1309 | 1312 | si = swp_swap_info(entry);
|
1310 | 1313 | if (!si)
|
1311 | 1314 | goto bad_nofile;
|
1312 |
| - |
1313 |
| - rcu_read_lock(); |
1314 |
| - if (data_race(!(si->flags & SWP_VALID))) |
1315 |
| - goto unlock_out; |
| 1315 | + if (!percpu_ref_tryget_live(&si->users)) |
| 1316 | + goto out; |
| 1317 | + /* |
| 1318 | + * Guarantee that si->users is checked before accessing other |
| 1319 | + * fields of swap_info_struct. |
| 1320 | + * |
| 1321 | + * Paired with the spin_unlock() after setup_swap_info() in |
| 1322 | + * enable_swap_info(). |
| 1323 | + */ |
| 1324 | + smp_rmb(); |
1316 | 1325 | offset = swp_offset(entry);
|
1317 | 1326 | if (offset >= si->max)
|
1318 |
| - goto unlock_out; |
| 1327 | + goto put_out; |
1319 | 1328 |
|
1320 | 1329 | return si;
|
1321 | 1330 | bad_nofile:
|
1322 | 1331 | pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
|
1323 | 1332 | out:
|
1324 | 1333 | return NULL;
|
1325 |
| -unlock_out: |
1326 |
| - rcu_read_unlock(); |
| 1334 | +put_out: |
| 1335 | + percpu_ref_put(&si->users); |
1327 | 1336 | return NULL;
|
1328 | 1337 | }
|
1329 | 1338 |
|
@@ -2466,7 +2475,7 @@ static void setup_swap_info(struct swap_info_struct *p, int prio,
|
2466 | 2475 |
|
2467 | 2476 | static void _enable_swap_info(struct swap_info_struct *p)
|
2468 | 2477 | {
|
2469 |
| - p->flags |= SWP_WRITEOK | SWP_VALID; |
| 2478 | + p->flags |= SWP_WRITEOK; |
2470 | 2479 | atomic_long_add(p->pages, &nr_swap_pages);
|
2471 | 2480 | total_swap_pages += p->pages;
|
2472 | 2481 |
|
@@ -2497,10 +2506,9 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
|
2497 | 2506 | spin_unlock(&p->lock);
|
2498 | 2507 | spin_unlock(&swap_lock);
|
2499 | 2508 | /*
|
2500 |
| - * Guarantee swap_map, cluster_info, etc. fields are valid |
2501 |
| - * between get/put_swap_device() if SWP_VALID bit is set |
| 2509 | + * Finished initializing swap device, now it's safe to reference it. |
2502 | 2510 | */
|
2503 |
| - synchronize_rcu(); |
| 2511 | + percpu_ref_resurrect(&p->users); |
2504 | 2512 | spin_lock(&swap_lock);
|
2505 | 2513 | spin_lock(&p->lock);
|
2506 | 2514 | _enable_swap_info(p);
|
@@ -2616,16 +2624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
2616 | 2624 |
|
2617 | 2625 | reenable_swap_slots_cache_unlock();
|
2618 | 2626 |
|
2619 |
| - spin_lock(&swap_lock); |
2620 |
| - spin_lock(&p->lock); |
2621 |
| - p->flags &= ~SWP_VALID; /* mark swap device as invalid */ |
2622 |
| - spin_unlock(&p->lock); |
2623 |
| - spin_unlock(&swap_lock); |
2624 | 2627 | /*
|
2625 |
| - * wait for swap operations protected by get/put_swap_device() |
2626 |
| - * to complete |
| 2628 | + * Wait for swap operations protected by get/put_swap_device() |
| 2629 | + * to complete. |
| 2630 | + * |
| 2631 | + * We need synchronize_rcu() here to protect accesses to |
| 2632 | + * the swap cache data structures. |
2627 | 2633 | */
|
| 2634 | + percpu_ref_kill(&p->users); |
2628 | 2635 | synchronize_rcu();
|
| 2636 | + wait_for_completion(&p->comp); |
2629 | 2637 |
|
2630 | 2638 | flush_work(&p->discard_work);
|
2631 | 2639 |
|
@@ -2857,13 +2865,20 @@ static struct swap_info_struct *alloc_swap_info(void)
|
2857 | 2865 | if (!p)
|
2858 | 2866 | return ERR_PTR(-ENOMEM);
|
2859 | 2867 |
|
| 2868 | + if (percpu_ref_init(&p->users, swap_users_ref_free, |
| 2869 | + PERCPU_REF_INIT_DEAD, GFP_KERNEL)) { |
| 2870 | + kvfree(p); |
| 2871 | + return ERR_PTR(-ENOMEM); |
| 2872 | + } |
| 2873 | + |
2860 | 2874 | spin_lock(&swap_lock);
|
2861 | 2875 | for (type = 0; type < nr_swapfiles; type++) {
|
2862 | 2876 | if (!(swap_info[type]->flags & SWP_USED))
|
2863 | 2877 | break;
|
2864 | 2878 | }
|
2865 | 2879 | if (type >= MAX_SWAPFILES) {
|
2866 | 2880 | spin_unlock(&swap_lock);
|
| 2881 | + percpu_ref_exit(&p->users); |
2867 | 2882 | kvfree(p);
|
2868 | 2883 | return ERR_PTR(-EPERM);
|
2869 | 2884 | }
|
@@ -2891,9 +2906,13 @@ static struct swap_info_struct *alloc_swap_info(void)
|
2891 | 2906 | plist_node_init(&p->avail_lists[i], 0);
|
2892 | 2907 | p->flags = SWP_USED;
|
2893 | 2908 | spin_unlock(&swap_lock);
|
2894 |
| - kvfree(defer); |
| 2909 | + if (defer) { |
| 2910 | + percpu_ref_exit(&defer->users); |
| 2911 | + kvfree(defer); |
| 2912 | + } |
2895 | 2913 | spin_lock_init(&p->lock);
|
2896 | 2914 | spin_lock_init(&p->cont_lock);
|
| 2915 | + init_completion(&p->comp); |
2897 | 2916 |
|
2898 | 2917 | return p;
|
2899 | 2918 | }
|
|
0 commit comments