
Commit 550a7d6

Mina Almasry authored and Linus Torvalds committed
mm, hugepages: add mremap() support for hugepage backed vma
Support mremap() for hugepage backed vma segments by simply repositioning the page table entries. The page table entries are repositioned to the new virtual address on mremap().

Hugetlb mremap() support is of course generic; my motivating use case is a library (hugepage_text) which reloads the ELF text of executables in hugepages. This significantly increases the execution performance of said executables.

Restrict the mremap operation on hugepages to up to the size of the original mapping, as the underlying hugetlb reservation is not yet capable of handling remapping to a larger size.

During the mremap() operation we detect pmd_share'd mappings and unshare them; the sharing is established again on the next access or fault.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Mina Almasry <[email protected]>
Reviewed-by: Mike Kravetz <[email protected]>
Cc: Ken Chen <[email protected]>
Cc: Chris Kennelly <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: Kirill Shutemov <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent bd3400e commit 550a7d6
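
For illustration only (not part of this commit): a minimal userspace sketch of the usage the commit message describes, moving a hugetlb-backed mapping to a new huge-page-aligned address without growing it. It assumes a 2 MiB default huge page size and that huge pages have been reserved (e.g. via /proc/sys/vm/nr_hugepages); the PROT_NONE placeholder is just one way to obtain a huge-page-aligned destination for MREMAP_FIXED.

/*
 * Illustrative sketch only -- not part of the diff below.
 * Assumes a 2 MiB default huge page size and reserved huge pages,
 * e.g. "echo 8 > /proc/sys/vm/nr_hugepages".
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>

#define HUGE_SZ	(2UL << 20)	/* assumed default huge page size */

int main(void)
{
	size_t len = 4 * HUGE_SZ;

	/* Hugepage backed mapping; mmap() returns a huge-page-aligned address. */
	unsigned char *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
				  -1, 0);
	if (old == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(old, 0x5a, len);		/* fault in the huge pages */

	/*
	 * Reserve a PROT_NONE span with extra slack so a huge-page-aligned
	 * destination can be carved out of it: the new address must be huge
	 * page aligned, and the mapping may not grow.
	 */
	unsigned char *span = mmap(NULL, len + HUGE_SZ, PROT_NONE,
				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (span == MAP_FAILED) {
		perror("mmap(span)");
		return 1;
	}
	unsigned char *dst = (unsigned char *)
		(((uintptr_t)span + HUGE_SZ - 1) & ~(HUGE_SZ - 1));

	/* Move (not grow) the hugetlb mapping onto the aligned destination. */
	void *new_addr = mremap(old, len, len,
				MREMAP_MAYMOVE | MREMAP_FIXED, dst);
	if (new_addr == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved %zu bytes from %p to %p\n", len, (void *)old, new_addr);

	munmap(new_addr, len);
	return 0;
}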

3 files changed: 157 additions and 9 deletions


include/linux/hugetlb.h

Lines changed: 19 additions & 0 deletions
@@ -124,6 +124,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
@@ -132,6 +133,10 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
 		loff_t *);
 
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+			     struct vm_area_struct *new_vma,
+			     unsigned long old_addr, unsigned long new_addr,
+			     unsigned long len);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 struct page **, struct vm_area_struct **,
@@ -215,6 +220,10 @@ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 }
 
+static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+}
+
 static inline unsigned long hugetlb_total_pages(void)
 {
 	return 0;
@@ -262,6 +271,16 @@ static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 	return 0;
 }
 
+static inline int move_hugetlb_page_tables(struct vm_area_struct *vma,
+					   struct vm_area_struct *new_vma,
+					   unsigned long old_addr,
+					   unsigned long new_addr,
+					   unsigned long len)
+{
+	BUG();
+	return 0;
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }

mm/hugetlb.c

Lines changed: 105 additions & 6 deletions
@@ -1014,6 +1014,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 		vma->vm_private_data = (void *)0;
 }
 
+/*
+ * Reset and decrement one ref on hugepage private reservation.
+ * Called with mm->mmap_sem writer semaphore held.
+ * This function should be only used by move_vma() and operate on
+ * same sized vma. It should never come here with last ref on the
+ * reservation.
+ */
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+	/*
+	 * Clear the old hugetlb private page reservation.
+	 * It has already been transferred to new_vma.
+	 *
+	 * During a mremap() operation of a hugetlb vma we call move_vma()
+	 * which copies vma into new_vma and unmaps vma. After the copy
+	 * operation both new_vma and vma share a reference to the resv_map
+	 * struct, and at that point vma is about to be unmapped. We don't
+	 * want to return the reservation to the pool at unmap of vma because
+	 * the reservation still lives on in new_vma, so simply decrement the
+	 * ref here and remove the resv_map reference from this vma.
+	 */
+	struct resv_map *reservations = vma_resv_map(vma);
+
+	if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+		kref_put(&reservations->refs, resv_map_release);
+
+	reset_vma_resv_huge_pages(vma);
+}
+
 /* Returns true if the VMA has associated reserve pages */
 static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
 {
@@ -4718,6 +4747,82 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	return ret;
 }
 
+static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
+			  unsigned long new_addr, pte_t *src_pte)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t *dst_pte, pte;
+	spinlock_t *src_ptl, *dst_ptl;
+
+	dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
+	dst_ptl = huge_pte_lock(h, mm, dst_pte);
+	src_ptl = huge_pte_lockptr(h, mm, src_pte);
+
+	/*
+	 * We don't have to worry about the ordering of src and dst ptlocks
+	 * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
+	 */
+	if (src_ptl != dst_ptl)
+		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+	set_huge_pte_at(mm, new_addr, dst_pte, pte);
+
+	if (src_ptl != dst_ptl)
+		spin_unlock(src_ptl);
+	spin_unlock(dst_ptl);
+}
+
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+			     struct vm_area_struct *new_vma,
+			     unsigned long old_addr, unsigned long new_addr,
+			     unsigned long len)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long sz = huge_page_size(h);
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long old_end = old_addr + len;
+	unsigned long old_addr_copy;
+	pte_t *src_pte, *dst_pte;
+	struct mmu_notifier_range range;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
+				old_end);
+	adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+	mmu_notifier_invalidate_range_start(&range);
+	/* Prevent race with file truncation */
+	i_mmap_lock_write(mapping);
+	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
+		src_pte = huge_pte_offset(mm, old_addr, sz);
+		if (!src_pte)
+			continue;
+		if (huge_pte_none(huge_ptep_get(src_pte)))
+			continue;
+
+		/* old_addr arg to huge_pmd_unshare() is a pointer and so the
+		 * arg may be modified. Pass a copy instead to preserve the
+		 * value in old_addr.
+		 */
+		old_addr_copy = old_addr;
+
+		if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+			continue;
+
+		dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
+		if (!dst_pte)
+			break;
+
+		move_huge_pte(vma, old_addr, new_addr, src_pte);
+	}
+	i_mmap_unlock_write(mapping);
+	flush_tlb_range(vma, old_end - len, old_end);
+	mmu_notifier_invalidate_range_end(&range);
+
+	return len + old_addr - old_end;
+}
+
 static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 				   unsigned long start, unsigned long end,
 				   struct page *ref_page)
@@ -6257,12 +6362,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * sharing is possible. For hugetlbfs, this prevents removal of any page
  * table entries associated with the address space. This is important as we
  * are setting up sharing based on existing page table entries (mappings).
- *
- * NOTE: This routine is only called from huge_pte_alloc. Some callers of
- * huge_pte_alloc know that sharing is not possible and do not take
- * i_mmap_rwsem as a performance optimization. This is handled by the
- * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
- * only required for subsequent processing.
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud)

mm/mremap.c

Lines changed: 33 additions & 3 deletions
@@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 	old_end = old_addr + len;
 	flush_cache_range(vma, old_addr, old_end);
 
+	if (is_vm_hugetlb_page(vma))
+		return move_hugetlb_page_tables(vma, new_vma, old_addr,
+						new_addr, len);
+
 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
 				old_addr, old_end);
 	mmu_notifier_invalidate_range_start(&range);
@@ -646,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		mremap_userfaultfd_prep(new_vma, uf);
 	}
 
+	if (is_vm_hugetlb_page(vma)) {
+		clear_vma_resv_huge_pages(vma);
+	}
+
 	/* Conceal VM_ACCOUNT so old reservation is not undone */
 	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
 		vma->vm_flags &= ~VM_ACCOUNT;
@@ -739,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 	    (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
 		return ERR_PTR(-EINVAL);
 
-	if (is_vm_hugetlb_page(vma))
-		return ERR_PTR(-EINVAL);
-
 	/* We can't remap across vm area boundaries */
 	if (old_len > vma->vm_end - addr)
 		return ERR_PTR(-EFAULT);
@@ -937,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 
 	if (mmap_write_lock_killable(current->mm))
 		return -EINTR;
+	vma = find_vma(mm, addr);
+	if (!vma || vma->vm_start > addr) {
+		ret = EFAULT;
+		goto out;
+	}
+
+	if (is_vm_hugetlb_page(vma)) {
+		struct hstate *h __maybe_unused = hstate_vma(vma);
+
+		old_len = ALIGN(old_len, huge_page_size(h));
+		new_len = ALIGN(new_len, huge_page_size(h));
+
+		/* addrs must be huge page aligned */
+		if (addr & ~huge_page_mask(h))
+			goto out;
+		if (new_addr & ~huge_page_mask(h))
+			goto out;
+
+		/*
+		 * Don't allow remap expansion, because the underlying hugetlb
+		 * reservation is not yet capable to handle split reservation.
+		 */
+		if (new_len > old_len)
+			goto out;
+	}
 
 	if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
