
Commit 4aae8d1

mjkravetz authored and torvalds committed
mm/hugetlbfs: unmap pages if page fault raced with hole punch
Page faults can race with fallocate hole punch.  If a page fault happens
between the unmap and remove operations, the page is not removed and
remains within the hole.  This is not the desired behavior.  The race is
difficult to detect in user level code as even in the non-race case, a
page within the hole could be faulted back in before fallocate returns.
If userfaultfd is expanded to support hugetlbfs in the future, this race
will be easier to observe.

If this race is detected and a page is mapped, the remove operation
(remove_inode_hugepages) will unmap the page before removing.  The unmap
within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
so that no other faults will be processed until the page is removed.

The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
remove_inode_hugepages to satisfy the new reference.

[[email protected]: move hugetlb_vmdelete_list()]
Signed-off-by: Mike Kravetz <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Naoya Horiguchi <[email protected]>
Cc: Hillf Danton <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Dave Hansen <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 9aacdd3 commit 4aae8d1
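
Editor's aside, for illustration only (not part of the commit): a minimal userspace sketch of the kind of hole punch vs. page fault race the message describes. The hugetlbfs mount point, file name, and 2MB huge page size are assumptions; compile with -pthread.

/*
 * Hypothetical reproducer sketch: one thread repeatedly punches a hole in a
 * hugetlbfs file while another keeps faulting the same huge page back in.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE      (2UL * 1024 * 1024)     /* assumed huge page size */

static char *map;

/* Write-fault the first huge page over and over. */
static void *faulter(void *arg)
{
        (void)arg;
        for (;;)
                map[0] = 1;     /* fault allocates and maps a huge page */
        return NULL;
}

int main(void)
{
        pthread_t t;
        int fd;

        fd = open("/dev/hugepages/race-test", O_CREAT | O_RDWR, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (ftruncate(fd, HPAGE_SIZE))
                perror("ftruncate");

        map = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        pthread_create(&t, NULL, faulter, NULL);

        /*
         * Repeatedly punch the page out.  Before this fix, a fault landing
         * between the unmap and remove steps inside fallocate could leave a
         * mapped page sitting inside the hole.
         */
        for (int i = 0; i < 100000; i++)
                fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                          0, HPAGE_SIZE);

        unlink("/dev/hugepages/race-test");
        return 0;
}

Even on an unpatched kernel this program is not guaranteed to show the problem: as the commit message notes, a page within the hole can legitimately be faulted back in before fallocate returns, which is exactly why the race is hard to detect from user level code.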

File tree: fs/hugetlbfs/inode.c

1 file changed: +75 -69 lines changed

fs/hugetlbfs/inode.c

Lines changed: 75 additions & 69 deletions
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
 	delete_from_page_cache(page);
 }
 
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * end == 0 indicates that the entire range after
+	 * start should be unmapped.
+	 */
+	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+		unsigned long v_offset;
+		unsigned long v_end;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the truncated area starting at pgoff,
+		 * and no vma on a 32-bit arch can span beyond the 4GB.
+		 */
+		if (vma->vm_pgoff < start)
+			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		if (!end)
+			v_end = vma->vm_end;
+		else {
+			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+							+ vma->vm_start;
+			if (v_end > vma->vm_end)
+				v_end = vma->vm_end;
+		}
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+								NULL);
+	}
+}
 
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch. There are subtle differences in operation for each case.
-
+ *
  * truncation is indicated by end of range being LLONG_MAX
  * In this case, we first scan the range and release found pages.
  * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 	for (i = 0; i < pagevec_count(&pvec); ++i) {
 		struct page *page = pvec.pages[i];
+		bool rsv_on_error;
 		u32 hash;
 
 		/*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 						mapping, next, 0);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
-		lock_page(page);
-		if (likely(!page_mapped(page))) {
-			bool rsv_on_error = !PagePrivate(page);
-			/*
-			 * We must free the huge page and remove
-			 * from page cache (remove_huge_page) BEFORE
-			 * removing the region/reserve map
-			 * (hugetlb_unreserve_pages). In rare out
-			 * of memory conditions, removal of the
-			 * region/reserve map could fail. Before
-			 * free'ing the page, note PagePrivate which
-			 * is used in case of error.
-			 */
-			remove_huge_page(page);
-			freed++;
-			if (!truncate_op) {
-				if (unlikely(hugetlb_unreserve_pages(
-						inode, next,
-						next + 1, 1)))
-					hugetlb_fix_reserve_counts(
-						inode, rsv_on_error);
-			}
-		} else {
-			/*
-			 * If page is mapped, it was faulted in after
-			 * being unmapped. It indicates a race between
-			 * hole punch and page fault. Do nothing in
-			 * this case. Getting here in a truncate
-			 * operation is a bug.
-			 */
+		/*
+		 * If page is mapped, it was faulted in after being
+		 * unmapped in caller. Unmap (again) now after taking
+		 * the fault mutex. The mutex will prevent faults
+		 * until we finish removing the page.
+		 *
+		 * This race can only happen in the hole punch case.
+		 * Getting here in a truncate operation is a bug.
+		 */
+		if (unlikely(page_mapped(page))) {
 			BUG_ON(truncate_op);
+
+			i_mmap_lock_write(mapping);
+			hugetlb_vmdelete_list(&mapping->i_mmap,
+				next * pages_per_huge_page(h),
+				(next + 1) * pages_per_huge_page(h));
+			i_mmap_unlock_write(mapping);
+		}
+
+		lock_page(page);
+		/*
+		 * We must free the huge page and remove from page
+		 * cache (remove_huge_page) BEFORE removing the
+		 * region/reserve map (hugetlb_unreserve_pages). In
+		 * rare out of memory conditions, removal of the
+		 * region/reserve map could fail. Before free'ing
+		 * the page, note PagePrivate which is used in case
+		 * of error.
+		 */
+		rsv_on_error = !PagePrivate(page);
+		remove_huge_page(page);
+		freed++;
+		if (!truncate_op) {
+			if (unlikely(hugetlb_unreserve_pages(inode,
+						next, next + 1, 1)))
+				hugetlb_fix_reserve_counts(inode,
+							rsv_on_error);
 		}
 
 		unlock_page(page);
@@ -452,44 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * end == 0 indicates that the entire range after
-	 * start should be unmapped.
-	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-		unsigned long v_offset;
-		unsigned long v_end;
-
-		/*
-		 * Can the expression below overflow on 32-bit arches?
-		 * No, because the interval tree returns us only those vmas
-		 * which overlap the truncated area starting at pgoff,
-		 * and no vma on a 32-bit arch can span beyond the 4GB.
-		 */
-		if (vma->vm_pgoff < start)
-			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-		else
-			v_offset = 0;
-
-		if (!end)
-			v_end = vma->vm_end;
-		else {
-			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
-							+ vma->vm_start;
-			if (v_end > vma->vm_end)
-				v_end = vma->vm_end;
-		}
-
-		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
-								NULL);
-	}
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;
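
Editor's aside on the v_offset/v_end arithmetic in hugetlb_vmdelete_list above: a minimal standalone sketch (not from the kernel tree) that mirrors the same calculation with made-up values, assuming 4KB base pages and a vma covering four 2MB huge pages. All variable values below are hypothetical.

/*
 * Illustration of how a punched file range, given in base-page units,
 * is converted to a virtual address range within one overlapping vma.
 */
#include <stdio.h>

#define PAGE_SHIFT	12

int main(void)
{
	/* Hypothetical vma: maps file pages [512, 2560) at 0x40000000. */
	unsigned long vm_pgoff = 512;
	unsigned long vm_start = 0x40000000UL;
	unsigned long vm_end   = vm_start + (2048UL << PAGE_SHIFT);

	/* Punch the 2MB huge page at file offset 4MB: pages [1024, 1536). */
	unsigned long start = 1024, end = 1536;
	unsigned long v_offset, v_end;

	/* Offset into the vma where the punched range begins. */
	if (vm_pgoff < start)
		v_offset = (start - vm_pgoff) << PAGE_SHIFT;
	else
		v_offset = 0;

	/* End address, clamped to the vma; end == 0 would mean "to the end". */
	if (!end)
		v_end = vm_end;
	else {
		v_end = ((end - vm_pgoff) << PAGE_SHIFT) + vm_start;
		if (v_end > vm_end)
			v_end = vm_end;
	}

	/* Prints: unmap [0x40200000, 0x40400000), i.e. one 2MB page. */
	printf("unmap [%#lx, %#lx)\n", vm_start + v_offset, v_end);
	return 0;
}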
