
Commit 8cc5fcb

Mina Almasry authored and Linus Torvalds committed
mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY
On UFFDIO_COPY, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we will drop the mutex and return to the caller after
allocating a page that consumed a reservation.  In this case there may be a
fault that double consumes the reservation.  To handle this, we free the
allocated page, fix the reservations, and allocate a temporary hugetlb page
and return that to the caller.  When the caller does the copy outside of the
lock, we again check the cache, and allocate a page consuming the
reservation, and copy over the contents.

Test: Hacked the code locally such that resv_huge_pages underflows produce
a warning and the copy_huge_page_from_user() always fails, then:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings.  After the test runs, the number
of free/resv hugepages is correct.

[[email protected]: remove set but not used variable 'vm_alloc_shared']
Link: https://lkml.kernel.org/r/[email protected]
[[email protected]: fix allocation error check and copy func name]
Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Mina Almasry <[email protected]>
Signed-off-by: YueHaibing <[email protected]>
Cc: Axel Rasmussen <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Mike Kravetz <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
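For context, the kernel path touched here is reached via the userfaultfd
UFFDIO_COPY ioctl on a hugetlb-backed registered range. Below is a minimal,
hedged userspace sketch of such a call; the helper name, error handling, and
the assumption that userfaultfd()/UFFDIO_API/UFFDIO_REGISTER setup has
already been done are illustrative and not part of this commit.

/* Illustrative sketch only: resolve a hugetlb fault with UFFDIO_COPY.
 * Assumes uffd is an initialized userfaultfd descriptor already registered
 * against the hugetlb mapping containing dst_hugepage.
 */
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <string.h>

static int resolve_with_uffdio_copy(int uffd, void *dst_hugepage,
				    const void *src_buf, size_t hpage_size)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	copy.dst = (unsigned long)dst_hugepage;	/* hugepage-aligned fault address */
	copy.src = (unsigned long)src_buf;	/* source contents in our address space */
	copy.len = hpage_size;			/* whole huge page */
	copy.mode = 0;

	/* The kernel copies src into a freshly allocated huge page and maps
	 * it. If copying src faults while hugetlb_fault_mutex is held, the
	 * kernel falls back to a temporary page and retries; that fallback
	 * is what this commit fixes to avoid a resv_huge_pages underflow.
	 */
	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
		return -1;
	return 0;
}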
1 parent 22f3c95 commit 8cc5fcb

File tree: 4 files changed, +45 −59 lines changed

include/linux/migrate.h

Lines changed: 4 additions & 0 deletions
@@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
+extern void copy_huge_page(struct page *dst, struct page *src);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 	return -ENOSYS;
 }
 
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION

mm/hugetlb.c

Lines changed: 39 additions & 9 deletions
@@ -30,6 +30,7 @@
 #include <linux/numa.h>
 #include <linux/llist.h>
 #include <linux/cma.h>
+#include <linux/migrate.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -5076,20 +5077,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    struct page **pagep)
 {
 	bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-	struct address_space *mapping;
-	pgoff_t idx;
+	struct hstate *h = hstate_vma(dst_vma);
+	struct address_space *mapping = dst_vma->vm_file->f_mapping;
+	pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
 	unsigned long size;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
-	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
 	spinlock_t *ptl;
-	int ret;
+	int ret = -ENOMEM;
 	struct page *page;
 	int writable;
 
-	mapping = dst_vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
 	if (is_continue) {
 		ret = -EFAULT;
 		page = find_lock_page(mapping, idx);
@@ -5118,12 +5116,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		/* fallback to copy_from_user outside mmap_lock */
 		if (unlikely(ret)) {
 			ret = -ENOENT;
+			/* Free the allocated page which may have
+			 * consumed a reservation.
+			 */
+			restore_reserve_on_error(h, dst_vma, dst_addr, page);
+			put_page(page);
+
+			/* Allocate a temporary page to hold the copied
+			 * contents.
+			 */
+			page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+			if (!page) {
+				ret = -ENOMEM;
+				goto out;
+			}
 			*pagep = page;
-			/* don't free the page */
+			/* Set the outparam pagep and return to the caller to
+			 * copy the contents outside the lock. Don't free the
+			 * page.
+			 */
 			goto out;
 		}
 	} else {
-		page = *pagep;
+		if (vm_shared &&
+		    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+			put_page(*pagep);
+			ret = -EEXIST;
+			*pagep = NULL;
+			goto out;
+		}
+
+		page = alloc_huge_page(dst_vma, dst_addr, 0);
+		if (IS_ERR(page)) {
+			ret = -ENOMEM;
+			*pagep = NULL;
+			goto out;
+		}
+		copy_huge_page(page, *pagep);
+		put_page(*pagep);
 		*pagep = NULL;
 	}
 
mm/migrate.c

Lines changed: 1 addition & 1 deletion
@@ -553,7 +553,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
 	}
 }
 
-static void copy_huge_page(struct page *dst, struct page *src)
+void copy_huge_page(struct page *dst, struct page *src)
 {
 	int i;
 	int nr_pages;

mm/userfaultfd.c

Lines changed: 1 addition & 49 deletions
@@ -209,7 +209,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long len,
 					      enum mcopy_atomic_mode mode)
 {
-	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
 	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
 	pte_t *dst_pte;
@@ -308,7 +307,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		i_mmap_unlock_read(mapping);
-		vm_alloc_shared = vm_shared;
 
 		cond_resched();
 
@@ -346,54 +344,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 out_unlock:
 	mmap_read_unlock(dst_mm);
 out:
-	if (page) {
-		/*
-		 * We encountered an error and are about to free a newly
-		 * allocated huge page.
-		 *
-		 * Reservation handling is very subtle, and is different for
-		 * private and shared mappings.  See the routine
-		 * restore_reserve_on_error for details.  Unfortunately, we
-		 * can not call restore_reserve_on_error now as it would
-		 * require holding mmap_lock.
-		 *
-		 * If a reservation for the page existed in the reservation
-		 * map of a private mapping, the map was modified to indicate
-		 * the reservation was consumed when the page was allocated.
-		 * We clear the HPageRestoreReserve flag now so that the global
-		 * reserve count will not be incremented in free_huge_page.
-		 * The reservation map will still indicate the reservation
-		 * was consumed and possibly prevent later page allocation.
-		 * This is better than leaking a global reservation.  If no
-		 * reservation existed, it is still safe to clear
-		 * HPageRestoreReserve as no adjustments to reservation counts
-		 * were made during allocation.
-		 *
-		 * The reservation map for shared mappings indicates which
-		 * pages have reservations.  When a huge page is allocated
-		 * for an address with a reservation, no change is made to
-		 * the reserve map.  In this case HPageRestoreReserve will be
-		 * set to indicate that the global reservation count should be
-		 * incremented when the page is freed.  This is the desired
-		 * behavior.  However, when a huge page is allocated for an
-		 * address without a reservation a reservation entry is added
-		 * to the reservation map, and HPageRestoreReserve will not be
-		 * set. When the page is freed, the global reserve count will
-		 * NOT be incremented and it will appear as though we have
-		 * leaked reserved page.  In this case, set HPageRestoreReserve
-		 * so that the global reserve count will be incremented to
-		 * match the reservation map entry which was created.
-		 *
-		 * Note that vm_alloc_shared is based on the flags of the vma
-		 * for which the page was originally allocated.  dst_vma could
-		 * be different or NULL on error.
-		 */
-		if (vm_alloc_shared)
-			SetHPageRestoreReserve(page);
-		else
-			ClearHPageRestoreReserve(page);
+	if (page)
 		put_page(page);
-	}
 	BUG_ON(copied < 0);
 	BUG_ON(err > 0);
 	BUG_ON(!copied && !err);
