Commit 9a0ddeb

ryncsn authored and akpm00 committed
mm, swap: hold a reference during scan and cleanup flag usage
The flag SWP_SCANNING was used as an indicator of whether a device is being scanned for allocation, and prevents swapoff. Combined with SWP_WRITEOK, they work as a set of barriers for a clean swapoff:

1. Swapoff clears SWP_WRITEOK; allocation requests will see ~SWP_WRITEOK and abort, as this is serialized by si->lock.
2. Swapoff unuses all allocated entries.
3. Swapoff waits for the SWP_SCANNING flag to be cleared, so ongoing allocations will stop, preventing UAF.
4. Now swapoff can free everything safely.

This gives the allocation path a hard dependency on si->lock: allocations always have to acquire si->lock first, to set SWP_SCANNING and check SWP_WRITEOK.

This commit removes the flag and just uses the existing per-CPU refcount instead to prevent UAF in step 3. It serves well for such usage without any dependency on si->lock, and scales very well too: simply hold a reference during the whole scan and allocation process, and swapoff will kill and wait for the counter.

And to prevent any allocation from happening after step 1, so the unuse in step 2 can ensure all slots are free, swapoff acquires the ci->lock of each cluster one by one, ensuring all allocations see ~SWP_WRITEOK and abort. This way, these dependencies on si->lock are gone.

It is worth noting that we can't kill the refcount as the first step of swapoff, because the unuse process has to acquire the refcount.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Kairui Song <[email protected]>
Cc: Baoquan He <[email protected]>
Cc: Barry Song <[email protected]>
Cc: Chris Li <[email protected]>
Cc: "Huang, Ying" <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Kalesh Singh <[email protected]>
Cc: Nhat Pham <[email protected]>
Cc: Ryan Roberts <[email protected]>
Cc: Yosry Ahmed <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
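As a reading aid, here is a compilable userspace model of the handshake described above: a simplified sketch with illustrative names (struct device, alloc_one and swapoff_model stand in for the kernel's structures; a plain atomic counter and a spin-wait stand in for the per-CPU refcount and its completion), not the kernel code itself.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CLUSTERS 4

struct device {
	atomic_int users;		/* models the per-CPU si->users refcount */
	atomic_bool writeok;		/* models SWP_WRITEOK */
	pthread_mutex_t cluster_lock[NR_CLUSTERS];
};

/* Allocation side: hold a reference for the whole scan, and check
 * writeok under the cluster lock, as cluster_alloc_range() now does. */
static bool alloc_one(struct device *dev, int cluster)
{
	bool ok = false;

	atomic_fetch_add(&dev->users, 1);	/* like get_swap_device_info() */
	pthread_mutex_lock(&dev->cluster_lock[cluster]);
	if (atomic_load(&dev->writeok))
		ok = true;			/* would take a slot here */
	pthread_mutex_unlock(&dev->cluster_lock[cluster]);
	atomic_fetch_sub(&dev->users, 1);	/* like put_swap_device() */
	return ok;
}

/* Swapoff side, in the order the commit message describes. */
static void swapoff_model(struct device *dev)
{
	/* 1. Clear writeok, then cycle every cluster lock: an allocator
	 *    still inside its critical section finishes first, and all
	 *    later ones see !writeok and abort (like wait_for_allocation()). */
	atomic_store(&dev->writeok, false);
	for (int i = 0; i < NR_CLUSTERS; i++) {
		pthread_mutex_lock(&dev->cluster_lock[i]);
		pthread_mutex_unlock(&dev->cluster_lock[i]);
	}
	/* 2. Unuse all allocated entries (elided). */
	/* 3. Wait for the refcount: scanners still holding a reference
	 *    can finish safely; the kernel sleeps on a completion here. */
	while (atomic_load(&dev->users) > 0)
		;
	/* 4. Now everything can be freed safely. */
}

int main(void)
{
	struct device dev = { .users = 0, .writeok = true };

	for (int i = 0; i < NR_CLUSTERS; i++)
		pthread_mutex_init(&dev.cluster_lock[i], NULL);
	printf("alloc before swapoff: %d\n", alloc_one(&dev, 0));	/* 1 */
	swapoff_model(&dev);
	printf("alloc after swapoff: %d\n", alloc_one(&dev, 0));	/* 0 */
	return 0;
}

The point of the lock-cycling pass is that any allocator that read writeok before it was cleared is still holding its cluster lock, so swapoff's lock/unlock of that cluster cannot complete until the allocator is done.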
1 parent b228386 · commit 9a0ddeb

File tree

2 files changed, +57 −34 lines

include/linux/swap.h

Lines changed: 0 additions & 1 deletion
@@ -219,7 +219,6 @@ enum {
 	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
 	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
 	/* add others here before... */
-	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL

mm/swapfile.c

Lines changed: 57 additions & 33 deletions
@@ -658,6 +658,8 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
 {
 	unsigned int nr_pages = 1 << order;
 
+	lockdep_assert_held(&ci->lock);
+
 	if (!(si->flags & SWP_WRITEOK))
 		return false;
 
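An aside on the new assertion (not part of the diff): lockdep_assert_held() makes the locking contract explicit, namely that cluster_alloc_range() must be entered with ci->lock held, which is exactly what the per-cluster lock pass added later in this patch relies on. The general idiom, using hypothetical names (struct foo, foo_update):

#include <linux/lockdep.h>
#include <linux/spinlock.h>

/* Hypothetical example type, not from this patch. */
struct foo {
	spinlock_t lock;
	unsigned long count;
};

static void foo_update(struct foo *f)
{
	/* Splats if the caller forgot f->lock; compiles away when
	 * lockdep is disabled. */
	lockdep_assert_held(&f->lock);
	f->count++;		/* safe: serialized by f->lock */
}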
@@ -1059,8 +1061,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 {
 	int n_ret = 0;
 
-	si->flags += SWP_SCANNING;
-
 	while (n_ret < nr) {
 		unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
 
@@ -1069,8 +1069,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
 		slots[n_ret++] = swp_entry(si->type, offset);
 	}
 
-	si->flags -= SWP_SCANNING;
-
 	return n_ret;
 }
 
@@ -1112,6 +1110,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 	return cluster_alloc_swap(si, usage, nr, slots, order);
 }
 
+static bool get_swap_device_info(struct swap_info_struct *si)
+{
+	if (!percpu_ref_tryget_live(&si->users))
+		return false;
+	/*
+	 * Guarantee the si->users are checked before accessing other
+	 * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
+	 * up to date.
+	 *
+	 * Paired with the spin_unlock() after setup_swap_info() in
+	 * enable_swap_info(), and smp_wmb() in swapoff.
+	 */
+	smp_rmb();
+	return true;
+}
+
 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 {
 	int order = swap_entry_order(entry_order);
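As a reading aid (not part of the diff): the helper is meant to be paired with put_swap_device(), so allocation paths bracket the whole scan in a reference while SWP_WRITEOK is checked separately under ci->lock. Schematically, callers follow this shape, which the get_swap_pages() and get_swap_page_of_type() hunks below instantiate:

	if (get_swap_device_info(si)) {	/* reference held: si can't be freed */
		/* ... scan and allocate; a concurrent swapoff may still
		 * clear SWP_WRITEOK, which the scan checks under ci->lock ... */
		put_swap_device(si);	/* drop the reference */
	}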
@@ -1139,13 +1153,16 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 		/* requeue si to after same-priority siblings */
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
-		spin_lock(&si->lock);
-		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
-					    n_goal, swp_entries, order);
-		spin_unlock(&si->lock);
-		if (n_ret || size > 1)
-			goto check_out;
-		cond_resched();
+		if (get_swap_device_info(si)) {
+			spin_lock(&si->lock);
+			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+					n_goal, swp_entries, order);
+			spin_unlock(&si->lock);
+			put_swap_device(si);
+			if (n_ret || size > 1)
+				goto check_out;
+			cond_resched();
+		}
 
 		spin_lock(&swap_avail_lock);
 		/*
@@ -1296,16 +1313,8 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
 	si = swp_swap_info(entry);
 	if (!si)
 		goto bad_nofile;
-	if (!percpu_ref_tryget_live(&si->users))
+	if (!get_swap_device_info(si))
 		goto out;
-	/*
-	 * Guarantee the si->users are checked before accessing other
-	 * fields of swap_info_struct.
-	 *
-	 * Paired with the spin_unlock() after setup_swap_info() in
-	 * enable_swap_info().
-	 */
-	smp_rmb();
 	offset = swp_offset(entry);
 	if (offset >= si->max)
 		goto put_out;
@@ -1785,10 +1794,13 @@ swp_entry_t get_swap_page_of_type(int type)
 		goto fail;
 
 	/* This is called for allocating swap entry, not cache */
-	spin_lock(&si->lock);
-	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
-		atomic_long_dec(&nr_swap_pages);
-	spin_unlock(&si->lock);
+	if (get_swap_device_info(si)) {
+		spin_lock(&si->lock);
+		if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
+			atomic_long_dec(&nr_swap_pages);
+		spin_unlock(&si->lock);
+		put_swap_device(si);
+	}
 fail:
 	return entry;
 }
@@ -2562,6 +2574,25 @@ bool has_usable_swap(void)
 	return ret;
 }
 
+/*
+ * Called after clearing SWP_WRITEOK, ensures cluster_alloc_range
+ * sees the updated flags, so there will be no more allocations.
+ */
+static void wait_for_allocation(struct swap_info_struct *si)
+{
+	unsigned long offset;
+	unsigned long end = ALIGN(si->max, SWAPFILE_CLUSTER);
+	struct swap_cluster_info *ci;
+
+	BUG_ON(si->flags & SWP_WRITEOK);
+
+	for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
+		ci = lock_cluster(si, offset);
+		unlock_cluster(ci);
+		offset += SWAPFILE_CLUSTER;
+	}
+}
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
@@ -2632,6 +2663,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
 
+	wait_for_allocation(p);
+
 	disable_swap_slots_cache_lock();
 
 	set_current_oom_origin();
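A note on ordering (editorial, not part of the diff): wait_for_allocation() only stops new allocations; scanners that already hold a reference are waited out later in swapoff(), after try_to_unuse(), by the pre-existing percpu_ref teardown. That is also why, as the commit message says, the refcount can't be killed as the first step. A simplified sketch of that unchanged path:

	/* Later in swapoff(), unchanged by this patch: kill the refcount
	 * only after unuse (try_to_unuse() itself takes references),
	 * then wait for every holder to drop out. */
	percpu_ref_kill(&p->users);
	/* ... */
	wait_for_completion(&p->comp);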
@@ -2674,15 +2707,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_lock(&p->lock);
 	drain_mmlist();
 
-	/* wait for anyone still in scan_swap_map_slots */
-	while (p->flags >= SWP_SCANNING) {
-		spin_unlock(&p->lock);
-		spin_unlock(&swap_lock);
-		schedule_timeout_uninterruptible(1);
-		spin_lock(&swap_lock);
-		spin_lock(&p->lock);
-	}
-
 	swap_file = p->swap_file;
 	p->swap_file = NULL;
 	p->max = 0;
