
Commit 6b34a09

npiggin authored and mpe committed
powerpc/64s/hash: add stress_hpt kernel boot option to increase hash faults
This option increases the number of hash misses by limiting the number of
kernel HPT entries, by keeping a per-CPU record of the last kernel HPTEs
installed, and removing that from the hash table on the next hash insertion.
A timer round-robins CPUs removing remaining kernel HPTEs and clearing the
TLB (in the case of bare metal) to increase and slightly randomise kernel
fault activity.

Signed-off-by: Nicholas Piggin <[email protected]>
[mpe: Add comment about NR_CPUS usage, fixup whitespace]
Signed-off-by: Michael Ellerman <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
1 parent dfecd06 commit 6b34a09

5 files changed, 160 insertions(+), 1 deletion(-)


Documentation/admin-guide/kernel-parameters.txt

Lines changed: 5 additions & 0 deletions
@@ -1042,6 +1042,11 @@
 			them frequently to increase the rate of SLB faults
 			on kernel addresses.
 
+	stress_hpt	[PPC]
+			Limits the number of kernel HPT entries in the hash
+			page table to increase the rate of hash page table
+			faults on kernel addresses.
+
 	disable=	[IPV6]
 			See Documentation/networking/ipv6.rst.
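The entry above is the whole user interface for the feature. As a usage sketch (not spelled out in the commit itself), the option takes no value, so booting a Book3S-64 hash-MMU kernel with plain

    stress_hpt

appended to the kernel command line is enough to enable it, and it can be combined with the existing stress_slb option. At boot, parse_stress_hpt() only sets stress_hpt_enabled; htab_initialize() (see the hash_utils.c hunks below) then enables the static key, allocates the per-CPU tracking array, and starts the 10 ms eviction timer.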

arch/powerpc/mm/book3s64/hash_4k.c

Lines changed: 5 additions & 0 deletions
@@ -16,6 +16,8 @@
 #include <asm/machdep.h>
 #include <asm/mmu.h>
 
+#include "internal.h"
+
 int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		   pte_t *ptep, unsigned long trap, unsigned long flags,
 		   int ssize, int subpg_prot)
@@ -118,6 +120,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		}
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
 		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+		if (stress_hpt())
+			hpt_do_stress(ea, hpte_group);
 	}
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;

arch/powerpc/mm/book3s64/hash_64k.c

Lines changed: 10 additions & 0 deletions
@@ -16,6 +16,8 @@
 #include <asm/machdep.h>
 #include <asm/mmu.h>
 
+#include "internal.h"
+
 /*
  * Return true, if the entry has a slot value which
  * the software considers as invalid.
@@ -216,6 +218,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
 	new_pte |= H_PAGE_HASHPTE;
 
+	if (stress_hpt())
+		hpt_do_stress(ea, hpte_group);
+
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
 }
@@ -327,7 +332,12 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 
 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
 		new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+		if (stress_hpt())
+			hpt_do_stress(ea, hpte_group);
 	}
+
 	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
 	return 0;
 }

arch/powerpc/mm/book3s64/hash_utils.c

Lines changed: 129 additions & 1 deletion
@@ -471,7 +471,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
 	return ret;
 }
 
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __ro_after_init;
 
 static int __init parse_disable_1tb_segments(char *p)
 {
@@ -480,6 +480,40 @@ static int __init parse_disable_1tb_segments(char *p)
 }
 early_param("disable_1tb_segments", parse_disable_1tb_segments);
 
+bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+	stress_hpt_enabled = true;
+	return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * per-CPU array allocated if we enable stress_hpt.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
+	unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+	/*
+	 * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
+	 * to allow practical forward progress. Bare metal returns 1, which
+	 * seems to help uncover more bugs.
+	 */
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		return STRESS_MAX_GROUPS;
+	else
+		return 1;
+}
+
+static struct stress_hpt_struct *stress_hpt_struct;
+
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
 					 const char *uname, int depth,
 					 void *data)
@@ -976,6 +1010,23 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
 	pr_info("Partition table %p\n", partition_tb);
 }
 
+void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
+void stress_hpt_timer_fn(struct timer_list *timer)
+{
+	int next_cpu;
+
+	hpt_clear_stress();
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		tlbiel_all();
+
+	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(cpu_online_mask);
+	stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+	add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
 static void __init htab_initialize(void)
 {
 	unsigned long table;
@@ -995,6 +1046,20 @@ static void __init htab_initialize(void)
 	if (stress_slb_enabled)
 		static_branch_enable(&stress_slb_key);
 
+	if (stress_hpt_enabled) {
+		unsigned long tmp;
+		static_branch_enable(&stress_hpt_key);
+		// Too early to use nr_cpu_ids, so use NR_CPUS
+		tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+						0, 0, MEMBLOCK_ALLOC_ANYWHERE);
+		memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+		stress_hpt_struct = __va(tmp);
+
+		timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+		stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+		add_timer(&stress_hpt_timer);
+	}
+
 	/*
 	 * Calculate the required size of the htab. We want the number of
 	 * PTEGs to equal one half the number of real pages.
@@ -1980,6 +2045,69 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
 	return slot;
 }
 
+void hpt_clear_stress(void)
+{
+	int cpu = raw_smp_processor_id();
+	int g;
+
+	for (g = 0; g < stress_nr_groups(); g++) {
+		unsigned long last_group;
+		last_group = stress_hpt_struct[cpu].last_group[g];
+
+		if (last_group != -1UL) {
+			int i;
+			for (i = 0; i < HPTES_PER_GROUP; i++) {
+				if (mmu_hash_ops.hpte_remove(last_group) == -1)
+					break;
+			}
+			stress_hpt_struct[cpu].last_group[g] = -1;
+		}
+	}
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
+{
+	unsigned long last_group;
+	int cpu = raw_smp_processor_id();
+
+	last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+	if (hpte_group == last_group)
+		return;
+
+	if (last_group != -1UL) {
+		int i;
+		/*
+		 * Concurrent CPUs might be inserting into this group, so
+		 * give up after a number of iterations, to prevent a live
+		 * lock.
+		 */
+		for (i = 0; i < HPTES_PER_GROUP; i++) {
+			if (mmu_hash_ops.hpte_remove(last_group) == -1)
+				break;
+		}
+		stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+	}
+
+	if (ea >= PAGE_OFFSET) {
+		/*
+		 * We would really like to prefetch to get the TLB loaded, then
+		 * remove the PTE before returning from fault interrupt, to
+		 * increase the hash fault rate.
+		 *
+		 * Unfortunately QEMU TCG does not model the TLB in a way that
+		 * makes this possible, and systemsim (mambo) emulator does not
+		 * bring in TLBs with prefetches (although loads/stores do
+		 * work for non-CI PTEs).
+		 *
+		 * So remember this PTE and clear it on the next hash fault.
+		 */
+		memmove(&stress_hpt_struct[cpu].last_group[1],
+			&stress_hpt_struct[cpu].last_group[0],
+			(stress_nr_groups() - 1) * sizeof(unsigned long));
+		stress_hpt_struct[cpu].last_group[0] = hpte_group;
+	}
+}
+
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);

arch/powerpc/mm/book3s64/internal.h

Lines changed: 11 additions & 0 deletions
@@ -13,6 +13,17 @@ static inline bool stress_slb(void)
 	return static_branch_unlikely(&stress_slb_key);
 }
 
+extern bool stress_hpt_enabled;
+
+DECLARE_STATIC_KEY_FALSE(stress_hpt_key);
+
+static inline bool stress_hpt(void)
+{
+	return static_branch_unlikely(&stress_hpt_key);
+}
+
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group);
+
 void slb_setup_new_exec(void);
 
 void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush);
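For any new call site, the intended pattern is the one the hash_4k.c and hash_64k.c hunks above add right after a kernel HPTE has been installed; a minimal sketch (the surrounding fault-handling code, and the ea/hpte_group variables, come from that context):

	/* the new translation has just been inserted into hpte_group */
	if (stress_hpt())			/* static key: patched out unless booted with stress_hpt */
		hpt_do_stress(ea, hpte_group);	/* record this group, evict the oldest one this CPU recorded */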
