@@ -471,7 +471,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend,
 	return ret;
 }
 
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __ro_after_init;
 
 static int __init parse_disable_1tb_segments(char *p)
 {
@@ -480,6 +480,40 @@ static int __init parse_disable_1tb_segments(char *p)
 }
 early_param("disable_1tb_segments", parse_disable_1tb_segments);
 
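+/* "stress_hpt" boot option: aggressively evict HPT entries to stress the hash fault path. */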
+bool stress_hpt_enabled __initdata;
+
+static int __init parse_stress_hpt(char *p)
+{
+	stress_hpt_enabled = true;
+	return 0;
+}
+early_param("stress_hpt", parse_stress_hpt);
+
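+/* Static key, enabled at boot when "stress_hpt" is given, so callers can test the option cheaply. */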
+__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key);
+
+/*
+ * per-CPU array allocated if we enable stress_hpt.
+ */
+#define STRESS_MAX_GROUPS 16
+struct stress_hpt_struct {
+	unsigned long last_group[STRESS_MAX_GROUPS];
+};
+
+static inline int stress_nr_groups(void)
+{
+	/*
+	 * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries
+	 * to allow practical forward progress. Bare metal returns 1, which
+	 * seems to help uncover more bugs.
+	 */
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		return STRESS_MAX_GROUPS;
+	else
+		return 1;
+}
+
+static struct stress_hpt_struct *stress_hpt_struct;
+
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
 					 const char *uname, int depth,
 					 void *data)
@@ -976,6 +1010,23 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
 	pr_info("Partition table %p\n", partition_tb);
 }
 
+void hpt_clear_stress(void);
+static struct timer_list stress_hpt_timer;
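+/*
+ * Clear this CPU's remembered groups, flush the TLB on bare metal, then
+ * bounce the timer to the next online CPU so every CPU gets a turn.
+ */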
+void stress_hpt_timer_fn(struct timer_list *timer)
+{
+	int next_cpu;
+
+	hpt_clear_stress();
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		tlbiel_all();
+
+	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(cpu_online_mask);
+	stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+	add_timer_on(&stress_hpt_timer, next_cpu);
+}
+
 static void __init htab_initialize(void)
 {
 	unsigned long table;
@@ -995,6 +1046,20 @@ static void __init htab_initialize(void)
 	if (stress_slb_enabled)
 		static_branch_enable(&stress_slb_key);
 
+	if (stress_hpt_enabled) {
+		unsigned long tmp;
+		static_branch_enable(&stress_hpt_key);
+		// Too early to use nr_cpu_ids, so use NR_CPUS
+		tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
+						0, 0, MEMBLOCK_ALLOC_ANYWHERE);
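+		/* 0xff bytes make every last_group entry -1UL, i.e. "no group remembered" */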
+		memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
+		stress_hpt_struct = __va(tmp);
+
+		timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0);
+		stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10);
+		add_timer(&stress_hpt_timer);
+	}
+
 	/*
 	 * Calculate the required size of the htab. We want the number of
 	 * PTEGs to equal one half the number of real pages.
@@ -1980,6 +2045,69 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn,
 	return slot;
 }
 
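+/*
+ * Remove every HPTE in each group this CPU has remembered, then mark the
+ * slots empty (-1UL) again.
+ */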
+void hpt_clear_stress(void)
+{
+	int cpu = raw_smp_processor_id();
+	int g;
+
+	for (g = 0; g < stress_nr_groups(); g++) {
+		unsigned long last_group;
+		last_group = stress_hpt_struct[cpu].last_group[g];
+
+		if (last_group != -1UL) {
+			int i;
+			for (i = 0; i < HPTES_PER_GROUP; i++) {
+				if (mmu_hash_ops.hpte_remove(last_group) == -1)
+					break;
+			}
+			stress_hpt_struct[cpu].last_group[g] = -1;
+		}
+	}
+}
+
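+/*
+ * Evict the oldest group this CPU remembered and, for kernel addresses,
+ * remember the group just used so a later fault will evict it as well.
+ */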
+void hpt_do_stress(unsigned long ea, unsigned long hpte_group)
+{
+	unsigned long last_group;
+	int cpu = raw_smp_processor_id();
+
+	last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1];
+	if (hpte_group == last_group)
+		return;
+
+	if (last_group != -1UL) {
+		int i;
+		/*
+		 * Concurrent CPUs might be inserting into this group, so
+		 * give up after a number of iterations, to prevent a live
+		 * lock.
+		 */
+		for (i = 0; i < HPTES_PER_GROUP; i++) {
+			if (mmu_hash_ops.hpte_remove(last_group) == -1)
+				break;
+		}
+		stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1;
+	}
+
+	if (ea >= PAGE_OFFSET) {
+		/*
+		 * We would really like to prefetch to get the TLB loaded, then
+		 * remove the PTE before returning from fault interrupt, to
+		 * increase the hash fault rate.
+		 *
+		 * Unfortunately QEMU TCG does not model the TLB in a way that
+		 * makes this possible, and systemsim (mambo) emulator does not
+		 * bring in TLBs with prefetches (although loads/stores do
+		 * work for non-CI PTEs).
+		 *
+		 * So remember this PTE and clear it on the next hash fault.
+		 */
+		memmove(&stress_hpt_struct[cpu].last_group[1],
+			&stress_hpt_struct[cpu].last_group[0],
+			(stress_nr_groups() - 1) * sizeof(unsigned long));
+		stress_hpt_struct[cpu].last_group[0] = hpte_group;
+	}
+}
+
 #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
 static DEFINE_RAW_SPINLOCK(linear_map_hash_lock);
 