Skip to content

Commit 8ee5382

Browse files
aa authored and torvalds committed
thp: mmu_notifier_test_young
For GRU and EPT, we need gup-fast to set the referenced bit too (this is why it's correct to return 0 when shadow_accessed_mask is zero; it requires gup-fast to set the referenced bit). qemu-kvm access already sets the young bit in the pte if it isn't zero-copy; if it's zero-copy or a shadow-paging EPT minor fault, we rely on gup-fast to signal that the page is in use. We also need to check the young bits on the secondary pagetables for NPT (and not nested shadow mmu), as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is fully backwards-compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs, just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page completely isn't so bad either, probably, but I thought it was worth it and this makes it reliable. Signed-off-by: Andrea Arcangeli <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 4b7167b commit 8ee5382

File tree

7 files changed

+105
-2
lines changed

7 files changed

+105
-2
lines changed

arch/x86/include/asm/kvm_host.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,6 +822,7 @@ extern bool kvm_rebooting;
822822
#define KVM_ARCH_WANT_MMU_NOTIFIER
823823
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
824824
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
825+
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
825826
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
826827
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
827828
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);

arch/x86/kvm/mmu.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -945,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
945945
return young;
946946
}
947947

948+
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
949+
unsigned long data)
950+
{
951+
u64 *spte;
952+
int young = 0;
953+
954+
/*
955+
* If there's no access bit in the secondary pte set by the
956+
* hardware it's up to gup-fast/gup to set the access bit in
957+
* the primary pte or in the page structure.
958+
*/
959+
if (!shadow_accessed_mask)
960+
goto out;
961+
962+
spte = rmap_next(kvm, rmapp, NULL);
963+
while (spte) {
964+
u64 _spte = *spte;
965+
BUG_ON(!(_spte & PT_PRESENT_MASK));
966+
young = _spte & PT_ACCESSED_MASK;
967+
if (young) {
968+
young = 1;
969+
break;
970+
}
971+
spte = rmap_next(kvm, rmapp, spte);
972+
}
973+
out:
974+
return young;
975+
}
976+
948977
#define RMAP_RECYCLE_THRESHOLD 1000
949978

950979
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -965,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
965994
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
966995
}
967996

997+
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
998+
{
999+
return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1000+
}
1001+
9681002
#ifdef MMU_DEBUG
9691003
static int is_empty_shadow_page(u64 *spt)
9701004
{

arch/x86/mm/gup.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include <linux/mm.h>
99
#include <linux/vmstat.h>
1010
#include <linux/highmem.h>
11+
#include <linux/swap.h>
1112

1213
#include <asm/pgtable.h>
1314

@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
8990
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
9091
page = pte_page(pte);
9192
get_page(page);
93+
SetPageReferenced(page);
9294
pages[*nr] = page;
9395
(*nr)++;
9496

@@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103105
VM_BUG_ON(page != compound_head(page));
104106
VM_BUG_ON(page_count(page) == 0);
105107
atomic_add(nr, &page->_count);
108+
SetPageReferenced(page);
106109
}
107110

108111
static inline void get_huge_page_tail(struct page *page)

include/linux/mmu_notifier.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,16 @@ struct mmu_notifier_ops {
6161
struct mm_struct *mm,
6262
unsigned long address);
6363

64+
/*
65+
* test_young is called to check the young/accessed bitflag in
66+
* the secondary pte. This is used to know if the page is
67+
* frequently used without actually clearing the flag or tearing
68+
* down the secondary mapping on the page.
69+
*/
70+
int (*test_young)(struct mmu_notifier *mn,
71+
struct mm_struct *mm,
72+
unsigned long address);
73+
6474
/*
6575
* change_pte is called in cases that pte mapping to page is changed:
6676
* for example, when ksm remaps pte to point to a new shared page.
@@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
163173
extern void __mmu_notifier_release(struct mm_struct *mm);
164174
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
165175
unsigned long address);
176+
extern int __mmu_notifier_test_young(struct mm_struct *mm,
177+
unsigned long address);
166178
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
167179
unsigned long address, pte_t pte);
168180
extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
186198
return 0;
187199
}
188200

201+
static inline int mmu_notifier_test_young(struct mm_struct *mm,
202+
unsigned long address)
203+
{
204+
if (mm_has_notifiers(mm))
205+
return __mmu_notifier_test_young(mm, address);
206+
return 0;
207+
}
208+
189209
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
190210
unsigned long address, pte_t pte)
191211
{
@@ -313,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
313333
return 0;
314334
}
315335

336+
static inline int mmu_notifier_test_young(struct mm_struct *mm,
337+
unsigned long address)
338+
{
339+
return 0;
340+
}
341+
316342
static inline void mmu_notifier_change_pte(struct mm_struct *mm,
317343
unsigned long address, pte_t pte)
318344
{

mm/huge_memory.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1632,7 +1632,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
16321632
VM_BUG_ON(PageLRU(page));
16331633

16341634
/* If there is no mapped pte young don't collapse the page */
1635-
if (pte_young(pteval))
1635+
if (pte_young(pteval) || PageReferenced(page) ||
1636+
mmu_notifier_test_young(vma->vm_mm, address))
16361637
referenced = 1;
16371638
}
16381639
if (unlikely(!referenced))
@@ -1892,7 +1893,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
18921893
/* cannot use mapcount: can't collapse if there's a gup pin */
18931894
if (page_count(page) != 1)
18941895
goto out_unmap;
1895-
if (pte_young(pteval))
1896+
if (pte_young(pteval) || PageReferenced(page) ||
1897+
mmu_notifier_test_young(vma->vm_mm, address))
18961898
referenced = 1;
18971899
}
18981900
if (referenced)

mm/mmu_notifier.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100100
return young;
101101
}
102102

103+
int __mmu_notifier_test_young(struct mm_struct *mm,
104+
unsigned long address)
105+
{
106+
struct mmu_notifier *mn;
107+
struct hlist_node *n;
108+
int young = 0;
109+
110+
rcu_read_lock();
111+
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112+
if (mn->ops->test_young) {
113+
young = mn->ops->test_young(mn, mm, address);
114+
if (young)
115+
break;
116+
}
117+
}
118+
rcu_read_unlock();
119+
120+
return young;
121+
}
122+
103123
void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104124
pte_t pte)
105125
{

virt/kvm/kvm_main.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
380380
return young;
381381
}
382382

383+
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
384+
struct mm_struct *mm,
385+
unsigned long address)
386+
{
387+
struct kvm *kvm = mmu_notifier_to_kvm(mn);
388+
int young, idx;
389+
390+
idx = srcu_read_lock(&kvm->srcu);
391+
spin_lock(&kvm->mmu_lock);
392+
young = kvm_test_age_hva(kvm, address);
393+
spin_unlock(&kvm->mmu_lock);
394+
srcu_read_unlock(&kvm->srcu, idx);
395+
396+
return young;
397+
}
398+
383399
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
384400
struct mm_struct *mm)
385401
{
@@ -396,6 +412,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
396412
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
397413
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
398414
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
415+
.test_young = kvm_mmu_notifier_test_young,
399416
.change_pte = kvm_mmu_notifier_change_pte,
400417
.release = kvm_mmu_notifier_release,
401418
};

0 commit comments

Comments
 (0)