Skip to content

Commit c46a7c8

Browse files
Mel Gorman authored and torvalds committed
x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels
_PAGE_NUMA is currently an alias of _PROT_PROTNONE to trap NUMA hinting faults on x86. Care is taken such that _PAGE_NUMA is used only in situations where the VMA flags distinguish between NUMA hinting faults and prot_none faults. This decision was x86-specific and conceptually it is difficult, requiring special casing to distinguish between PROTNONE and NUMA ptes based on context. Fundamentally, we only need the _PAGE_NUMA bit to tell the difference between an entry that is really unmapped and a page that is protected for NUMA hinting faults, as if the PTE is not present then a fault will be trapped. Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset. This patch shrinks the maximum possible swap size and uses the bit to uniquely distinguish between NUMA hinting ptes and swap ptes. Signed-off-by: Mel Gorman <[email protected]> Cc: David Vrabel <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Peter Anvin <[email protected]> Cc: Fengguang Wu <[email protected]> Cc: Linus Torvalds <[email protected]> Cc: Steven Noonan <[email protected]> Cc: Rik van Riel <[email protected]> Cc: Peter Zijlstra <[email protected]> Cc: Andrea Arcangeli <[email protected]> Cc: Dave Hansen <[email protected]> Cc: Srikar Dronamraju <[email protected]> Cc: Cyrill Gorcunov <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 4468dd7 commit c46a7c8

File tree

8 files changed

+75
-49
lines changed

8 files changed

+75
-49
lines changed

arch/powerpc/include/asm/pgtable.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte)
4444
return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
4545
}
4646

47+
#define pte_present_nonuma pte_present_nonuma
48+
static inline int pte_present_nonuma(pte_t pte)
49+
{
50+
return pte_val(pte) & (_PAGE_PRESENT);
51+
}
52+
4753
#define pte_numa pte_numa
4854
static inline int pte_numa(pte_t pte)
4955
{

arch/x86/include/asm/pgtable.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
131131

132132
static inline int pte_special(pte_t pte)
133133
{
134-
return pte_flags(pte) & _PAGE_SPECIAL;
134+
return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
135+
(_PAGE_PRESENT|_PAGE_SPECIAL);
135136
}
136137

137138
static inline unsigned long pte_pfn(pte_t pte)
@@ -452,6 +453,12 @@ static inline int pte_present(pte_t a)
452453
_PAGE_NUMA);
453454
}
454455

456+
#define pte_present_nonuma pte_present_nonuma
457+
static inline int pte_present_nonuma(pte_t a)
458+
{
459+
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
460+
}
461+
455462
#define pte_accessible pte_accessible
456463
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
457464
{
@@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
860867

861868
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
862869
{
863-
VM_BUG_ON(pte_present(pte));
870+
VM_BUG_ON(pte_present_nonuma(pte));
864871
return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
865872
}
866873

867874
static inline int pte_swp_soft_dirty(pte_t pte)
868875
{
869-
VM_BUG_ON(pte_present(pte));
876+
VM_BUG_ON(pte_present_nonuma(pte));
870877
return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
871878
}
872879

873880
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
874881
{
875-
VM_BUG_ON(pte_present(pte));
882+
VM_BUG_ON(pte_present_nonuma(pte));
876883
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
877884
}
878885

arch/x86/include/asm/pgtable_64.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
145145
/* Encode and de-code a swap entry */
146146
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
147147
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
148+
#ifdef CONFIG_NUMA_BALANCING
149+
/* Automatic NUMA balancing needs to be distinguishable from swap entries */
150+
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
151+
#else
148152
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
153+
#endif
149154
#else
155+
#ifdef CONFIG_NUMA_BALANCING
156+
#error Incompatible format for automatic NUMA balancing
157+
#endif
150158
#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
151159
#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
152160
#endif

arch/x86/include/asm/pgtable_types.h

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,26 @@
1616
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
1717
#define _PAGE_BIT_PAT 7 /* on 4KB pages */
1818
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19-
#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
20-
#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
21-
#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */
19+
#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
20+
#define _PAGE_BIT_SOFTW2 10 /* " */
21+
#define _PAGE_BIT_SOFTW3 11 /* " */
2222
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23-
#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24-
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25-
#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
23+
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
24+
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
25+
#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
26+
#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
27+
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
28+
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
2629
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
2730

31+
/*
32+
* Swap offsets on configurations that allow automatic NUMA balancing use the
33+
* bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
34+
* swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
35+
* maximum possible swap space from 16TB to 8TB.
36+
*/
37+
#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
38+
2839
/* If _PAGE_BIT_PRESENT is clear, we use these: */
2940
/* - if the user mapped it with PROT_NONE; pte_present gives true */
3041
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -40,7 +51,7 @@
4051
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
4152
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
4253
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
43-
#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
54+
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
4455
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
4556
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
4657
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@@ -61,14 +72,27 @@
6172
* they do not conflict with each other.
6273
*/
6374

64-
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
65-
6675
#ifdef CONFIG_MEM_SOFT_DIRTY
6776
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
6877
#else
6978
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
7079
#endif
7180

81+
/*
82+
* _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
83+
* that is not present. The hinting fault gathers numa placement statistics
84+
* (see pte_numa()). The bit is always zero when the PTE is not present.
85+
*
86+
* The bit picked must be always zero when the pmd is present and not
87+
* present, so that we don't lose information when we set it while
88+
* atomically clearing the present bit.
89+
*/
90+
#ifdef CONFIG_NUMA_BALANCING
91+
#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
92+
#else
93+
#define _PAGE_NUMA (_AT(pteval_t, 0))
94+
#endif
95+
7296
/*
7397
* Tracking soft dirty bit when a page goes to a swap is tricky.
7498
* We need a bit which can be stored in pte _and_ not conflict
@@ -94,26 +118,6 @@
94118
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
95119
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
96120

97-
/*
98-
* _PAGE_NUMA indicates that this page will trigger a numa hinting
99-
* minor page fault to gather numa placement statistics (see
100-
* pte_numa()). The bit picked (8) is within the range between
101-
* _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
102-
* require changes to the swp entry format because that bit is always
103-
* zero when the pte is not present.
104-
*
105-
* The bit picked must be always zero when the pmd is present and not
106-
* present, so that we don't lose information when we set it while
107-
* atomically clearing the present bit.
108-
*
109-
* Because we shared the same bit (8) with _PAGE_PROTNONE this can be
110-
* interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
111-
* couldn't reach, like handle_mm_fault() (see access_error in
112-
* arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
113-
* handle_mm_fault() to be invoked).
114-
*/
115-
#define _PAGE_NUMA _PAGE_PROTNONE
116-
117121
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
118122
_PAGE_ACCESSED | _PAGE_DIRTY)
119123
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
@@ -122,8 +126,8 @@
122126
/* Set of bits not changed in pte_modify */
123127
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
124128
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
125-
_PAGE_SOFT_DIRTY)
126-
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
129+
_PAGE_SOFT_DIRTY | _PAGE_NUMA)
130+
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
127131

128132
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
129133
#define _PAGE_CACHE_WB (0)

arch/x86/mm/pageattr-test.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ enum {
3535

3636
static int pte_testbit(pte_t pte)
3737
{
38-
return pte_flags(pte) & _PAGE_UNUSED1;
38+
return pte_flags(pte) & _PAGE_SOFTW1;
3939
}
4040

4141
struct split_state {

include/asm-generic/pgtable.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
233233
# define pte_accessible(mm, pte) ((void)(pte), 1)
234234
#endif
235235

236+
#ifndef pte_present_nonuma
237+
#define pte_present_nonuma(pte) pte_present(pte)
238+
#endif
239+
236240
#ifndef flush_tlb_fix_spurious_fault
237241
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
238242
#endif
@@ -670,15 +674,15 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
670674
static inline int pte_numa(pte_t pte)
671675
{
672676
return (pte_flags(pte) &
673-
(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
677+
(_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
674678
}
675679
#endif
676680

677681
#ifndef pmd_numa
678682
static inline int pmd_numa(pmd_t pmd)
679683
{
680684
return (pmd_flags(pmd) &
681-
(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
685+
(_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
682686
}
683687
#endif
684688

include/linux/swapops.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
5454
/* check whether a pte points to a swap entry */
5555
static inline int is_swap_pte(pte_t pte)
5656
{
57-
return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
57+
return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte);
5858
}
5959
#endif
6060

mm/memory.c

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -756,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
756756
unsigned long pfn = pte_pfn(pte);
757757

758758
if (HAVE_PTE_SPECIAL) {
759-
if (likely(!pte_special(pte)))
759+
if (likely(!pte_special(pte) || pte_numa(pte)))
760760
goto check_pfn;
761761
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
762762
return NULL;
@@ -782,14 +782,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
782782
}
783783
}
784784

785-
if (is_zero_pfn(pfn))
786-
return NULL;
787785
check_pfn:
788786
if (unlikely(pfn > highest_memmap_pfn)) {
789787
print_bad_pte(vma, addr, pte, NULL);
790788
return NULL;
791789
}
792790

791+
if (is_zero_pfn(pfn))
792+
return NULL;
793+
793794
/*
794795
* NOTE! We still have PageReserved() pages in the page tables.
795796
* eg. VDSO mappings can cause them to exist.
@@ -1722,13 +1723,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
17221723
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
17231724

17241725
/*
1725-
* If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
1726-
* would be called on PROT_NONE ranges. We must never invoke
1727-
* handle_mm_fault on PROT_NONE ranges or the NUMA hinting
1728-
* page faults would unprotect the PROT_NONE ranges if
1729-
* _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
1730-
* bitflag. So to avoid that, don't set FOLL_NUMA if
1731-
* FOLL_FORCE is set.
1726+
* If FOLL_FORCE is set then do not force a full fault as the hinting
1727+
* fault information is unrelated to the reference behaviour of a task
1728+
* using the address space
17321729
*/
17331730
if (!(gup_flags & FOLL_FORCE))
17341731
gup_flags |= FOLL_NUMA;

0 commit comments

Comments (0)