Skip to content

Commit f5d54a4

Browse files
committed
Merge tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 fixes from Borislav Petkov: - Fix a couple of SWAPGS fencing issues in the x86 entry code - Use the proper operand types in __{get,put}_user() to prevent truncation in SEV-ES string io - Make sure the kernel mappings are present in trampoline_pgd in order to prevent any potential accesses to unmapped memory after switching to it - Fix a trivial list corruption in objtool's pv_ops validation - Disable the clocksource watchdog for TSC on platforms which claim that the TSC is constant, doesn't stop in sleep states, CPU has TSC adjust and the number of sockets of the platform are max 2, to prevent erroneous markings of the TSC as unstable. - Make sure TSC adjust is always checked not only when going idle - Prevent a stack leak by initializing struct _fpx_sw_bytes properly in the FPU code - Fix INTEL_FAM6_RAPTORLAKE define naming to adhere to the convention * tag 'x86_urgent_for_v5.16_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/xen: Add xenpv_restore_regs_and_return_to_usermode() x86/entry: Use the correct fence macro after swapgs in kernel CR3 x86/entry: Add a fence for kernel entry SWAPGS in paranoid_entry() x86/sev: Fix SEV-ES INS/OUTS instructions for word, dword, and qword x86/64/mm: Map all kernel memory into trampoline_pgd objtool: Fix pv_ops noinstr validation x86/tsc: Disable clocksource watchdog for TSC on qualified platorms x86/tsc: Add a timer to make sure TSC_adjust is always checked x86/fpu/signal: Initialize sw_bytes in save_xstate_epilog() x86/cpu: Drop spurious underscore from RAPTOR_LAKE #define
2 parents 90bf8d9 + 5c8f6a2 commit f5d54a4

File tree

10 files changed

+159
-43
lines changed

10 files changed

+159
-43
lines changed

arch/x86/entry/entry_64.S

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,10 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
574574
ud2
575575
1:
576576
#endif
577+
#ifdef CONFIG_XEN_PV
578+
ALTERNATIVE "", "jmp xenpv_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
579+
#endif
580+
577581
POP_REGS pop_rdi=0
578582

579583
/*
@@ -890,28 +894,22 @@ SYM_CODE_START_LOCAL(paranoid_entry)
890894
.Lparanoid_entry_checkgs:
891895
/* EBX = 1 -> kernel GSBASE active, no restore required */
892896
movl $1, %ebx
897+
893898
/*
894899
* The kernel-enforced convention is a negative GSBASE indicates
895900
* a kernel value. No SWAPGS needed on entry and exit.
896901
*/
897902
movl $MSR_GS_BASE, %ecx
898903
rdmsr
899904
testl %edx, %edx
900-
jns .Lparanoid_entry_swapgs
901-
ret
905+
js .Lparanoid_kernel_gsbase
902906

903-
.Lparanoid_entry_swapgs:
907+
/* EBX = 0 -> SWAPGS required on exit */
908+
xorl %ebx, %ebx
904909
swapgs
910+
.Lparanoid_kernel_gsbase:
905911

906-
/*
907-
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
908-
* unconditional CR3 write, even in the PTI case. So do an lfence
909-
* to prevent GS speculation, regardless of whether PTI is enabled.
910-
*/
911912
FENCE_SWAPGS_KERNEL_ENTRY
912-
913-
/* EBX = 0 -> SWAPGS required on exit */
914-
xorl %ebx, %ebx
915913
ret
916914
SYM_CODE_END(paranoid_entry)
917915

@@ -993,11 +991,6 @@ SYM_CODE_START_LOCAL(error_entry)
993991
pushq %r12
994992
ret
995993

996-
.Lerror_entry_done_lfence:
997-
FENCE_SWAPGS_KERNEL_ENTRY
998-
.Lerror_entry_done:
999-
ret
1000-
1001994
/*
1002995
* There are two places in the kernel that can potentially fault with
1003996
* usergs. Handle them here. B stepping K8s sometimes report a
@@ -1020,8 +1013,14 @@ SYM_CODE_START_LOCAL(error_entry)
10201013
* .Lgs_change's error handler with kernel gsbase.
10211014
*/
10221015
SWAPGS
1023-
FENCE_SWAPGS_USER_ENTRY
1024-
jmp .Lerror_entry_done
1016+
1017+
/*
1018+
* Issue an LFENCE to prevent GS speculation, regardless of whether it is a
1019+
* kernel or user gsbase.
1020+
*/
1021+
.Lerror_entry_done_lfence:
1022+
FENCE_SWAPGS_KERNEL_ENTRY
1023+
ret
10251024

10261025
.Lbstep_iret:
10271026
/* Fix truncated RIP */

arch/x86/include/asm/intel-family.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
109109
#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
110110

111-
#define INTEL_FAM6_RAPTOR_LAKE 0xB7
111+
#define INTEL_FAM6_RAPTORLAKE 0xB7
112112

113113
/* "Small Core" Processors (Atom) */
114114

arch/x86/kernel/fpu/signal.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
118118
struct fpstate *fpstate)
119119
{
120120
struct xregs_state __user *x = buf;
121-
struct _fpx_sw_bytes sw_bytes;
121+
struct _fpx_sw_bytes sw_bytes = {};
122122
u32 xfeatures;
123123
int err;
124124

arch/x86/kernel/sev.c

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -294,11 +294,6 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
294294
char *dst, char *buf, size_t size)
295295
{
296296
unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
297-
char __user *target = (char __user *)dst;
298-
u64 d8;
299-
u32 d4;
300-
u16 d2;
301-
u8 d1;
302297

303298
/*
304299
* This function uses __put_user() independent of whether kernel or user
@@ -320,26 +315,42 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
320315
* instructions here would cause infinite nesting.
321316
*/
322317
switch (size) {
323-
case 1:
318+
case 1: {
319+
u8 d1;
320+
u8 __user *target = (u8 __user *)dst;
321+
324322
memcpy(&d1, buf, 1);
325323
if (__put_user(d1, target))
326324
goto fault;
327325
break;
328-
case 2:
326+
}
327+
case 2: {
328+
u16 d2;
329+
u16 __user *target = (u16 __user *)dst;
330+
329331
memcpy(&d2, buf, 2);
330332
if (__put_user(d2, target))
331333
goto fault;
332334
break;
333-
case 4:
335+
}
336+
case 4: {
337+
u32 d4;
338+
u32 __user *target = (u32 __user *)dst;
339+
334340
memcpy(&d4, buf, 4);
335341
if (__put_user(d4, target))
336342
goto fault;
337343
break;
338-
case 8:
344+
}
345+
case 8: {
346+
u64 d8;
347+
u64 __user *target = (u64 __user *)dst;
348+
339349
memcpy(&d8, buf, 8);
340350
if (__put_user(d8, target))
341351
goto fault;
342352
break;
353+
}
343354
default:
344355
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
345356
return ES_UNSUPPORTED;
@@ -362,11 +373,6 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
362373
char *src, char *buf, size_t size)
363374
{
364375
unsigned long error_code = X86_PF_PROT;
365-
char __user *s = (char __user *)src;
366-
u64 d8;
367-
u32 d4;
368-
u16 d2;
369-
u8 d1;
370376

371377
/*
372378
* This function uses __get_user() independent of whether kernel or user
@@ -388,26 +394,41 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
388394
* instructions here would cause infinite nesting.
389395
*/
390396
switch (size) {
391-
case 1:
397+
case 1: {
398+
u8 d1;
399+
u8 __user *s = (u8 __user *)src;
400+
392401
if (__get_user(d1, s))
393402
goto fault;
394403
memcpy(buf, &d1, 1);
395404
break;
396-
case 2:
405+
}
406+
case 2: {
407+
u16 d2;
408+
u16 __user *s = (u16 __user *)src;
409+
397410
if (__get_user(d2, s))
398411
goto fault;
399412
memcpy(buf, &d2, 2);
400413
break;
401-
case 4:
414+
}
415+
case 4: {
416+
u32 d4;
417+
u32 __user *s = (u32 __user *)src;
418+
402419
if (__get_user(d4, s))
403420
goto fault;
404421
memcpy(buf, &d4, 4);
405422
break;
406-
case 8:
423+
}
424+
case 8: {
425+
u64 d8;
426+
u64 __user *s = (u64 __user *)src;
407427
if (__get_user(d8, s))
408428
goto fault;
409429
memcpy(buf, &d8, 8);
410430
break;
431+
}
411432
default:
412433
WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
413434
return ES_UNSUPPORTED;

arch/x86/kernel/tsc.c

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1180,6 +1180,12 @@ void mark_tsc_unstable(char *reason)
11801180

11811181
EXPORT_SYMBOL_GPL(mark_tsc_unstable);
11821182

1183+
static void __init tsc_disable_clocksource_watchdog(void)
1184+
{
1185+
clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
1186+
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
1187+
}
1188+
11831189
static void __init check_system_tsc_reliable(void)
11841190
{
11851191
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1196,6 +1202,23 @@ static void __init check_system_tsc_reliable(void)
11961202
#endif
11971203
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
11981204
tsc_clocksource_reliable = 1;
1205+
1206+
/*
1207+
* Disable the clocksource watchdog when the system has:
1208+
* - TSC running at constant frequency
1209+
* - TSC which does not stop in C-States
1210+
* - the TSC_ADJUST register which allows detecting even minimal
1211+
* modifications
1212+
* - not more than two sockets. As the number of sockets cannot be
1213+
* evaluated at the early boot stage where this has to be
1214+
* invoked, check the number of online memory nodes as a
1215+
* fallback solution which is a reasonable estimate.
1216+
*/
1217+
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
1218+
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
1219+
boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
1220+
nr_online_nodes <= 2)
1221+
tsc_disable_clocksource_watchdog();
11991222
}
12001223

12011224
/*
@@ -1387,9 +1410,6 @@ static int __init init_tsc_clocksource(void)
13871410
if (tsc_unstable)
13881411
goto unreg;
13891412

1390-
if (tsc_clocksource_reliable || no_tsc_watchdog)
1391-
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
1392-
13931413
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
13941414
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
13951415

@@ -1527,7 +1547,7 @@ void __init tsc_init(void)
15271547
}
15281548

15291549
if (tsc_clocksource_reliable || no_tsc_watchdog)
1530-
clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
1550+
tsc_disable_clocksource_watchdog();
15311551

15321552
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
15331553
detect_art();

arch/x86/kernel/tsc_sync.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ struct tsc_adjust {
3030
};
3131

3232
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
33+
static struct timer_list tsc_sync_check_timer;
3334

3435
/*
3536
* TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,46 @@ void tsc_verify_tsc_adjust(bool resume)
7778
}
7879
}
7980

81+
/*
82+
* Normally the tsc_sync will be checked every time the system enters idle
83+
* state, but there is still a caveat that a system won't enter idle,
84+
* either because it's too busy or configured purposely to not enter
85+
* idle.
86+
*
87+
* So set up a periodic timer (every 10 minutes) to make sure the check
88+
* is always on.
89+
*/
90+
91+
#define SYNC_CHECK_INTERVAL (HZ * 600)
92+
93+
static void tsc_sync_check_timer_fn(struct timer_list *unused)
94+
{
95+
int next_cpu;
96+
97+
tsc_verify_tsc_adjust(false);
98+
99+
/* Run the check for all onlined CPUs in turn */
100+
next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
101+
if (next_cpu >= nr_cpu_ids)
102+
next_cpu = cpumask_first(cpu_online_mask);
103+
104+
tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
105+
add_timer_on(&tsc_sync_check_timer, next_cpu);
106+
}
107+
108+
static int __init start_sync_check_timer(void)
109+
{
110+
if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
111+
return 0;
112+
113+
timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
114+
tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
115+
add_timer(&tsc_sync_check_timer);
116+
117+
return 0;
118+
}
119+
late_initcall(start_sync_check_timer);
120+
80121
static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
81122
unsigned int cpu, bool bootcpu)
82123
{

arch/x86/realmode/init.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ static void __init setup_real_mode(void)
7272
#ifdef CONFIG_X86_64
7373
u64 *trampoline_pgd;
7474
u64 efer;
75+
int i;
7576
#endif
7677

7778
base = (unsigned char *)real_mode_header;
@@ -128,8 +129,17 @@ static void __init setup_real_mode(void)
128129
trampoline_header->flags = 0;
129130

130131
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
132+
133+
/* Map the real mode stub as virtual == physical */
131134
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
132-
trampoline_pgd[511] = init_top_pgt[511].pgd;
135+
136+
/*
137+
* Include the entirety of the kernel mapping into the trampoline
138+
* PGD. This way, all mappings present in the normal kernel page
139+
* tables are usable while running on trampoline_pgd.
140+
*/
141+
for (i = pgd_index(__PAGE_OFFSET); i < PTRS_PER_PGD; i++)
142+
trampoline_pgd[i] = init_top_pgt[i].pgd;
133143
#endif
134144

135145
sme_sev_setup_real_mode(trampoline_header);

arch/x86/xen/xen-asm.S

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
#include <linux/init.h>
2222
#include <linux/linkage.h>
23+
#include <../entry/calling.h>
2324

2425
.pushsection .noinstr.text, "ax"
2526
/*
@@ -192,6 +193,25 @@ SYM_CODE_START(xen_iret)
192193
jmp hypercall_iret
193194
SYM_CODE_END(xen_iret)
194195

196+
/*
197+
* XEN pv doesn't use the trampoline stack; PER_CPU_VAR(cpu_tss_rw + TSS_sp0) is
198+
* also the kernel stack. Reusing swapgs_restore_regs_and_return_to_usermode()
199+
* in XEN pv would cause %rsp to move up to the top of the kernel stack and
200+
* leave the IRET frame below %rsp, where it could be corrupted if an #NMI
201+
* interrupts. And swapgs_restore_regs_and_return_to_usermode() pushing the IRET
202+
* frame at the same address is useless.
203+
*/
204+
SYM_CODE_START(xenpv_restore_regs_and_return_to_usermode)
205+
UNWIND_HINT_REGS
206+
POP_REGS
207+
208+
/* stackleak_erase() can work safely on the kernel stack. */
209+
STACKLEAK_ERASE_NOCLOBBER
210+
211+
addq $8, %rsp /* skip regs->orig_ax */
212+
jmp xen_iret
213+
SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode)
214+
195215
/*
196216
* Xen handles syscall callbacks much like ordinary exceptions, which
197217
* means we have:

tools/objtool/elf.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ static int read_symbols(struct elf *elf)
375375
return -1;
376376
}
377377
memset(sym, 0, sizeof(*sym));
378+
INIT_LIST_HEAD(&sym->pv_target);
378379
sym->alias = sym;
379380

380381
sym->idx = i;

tools/objtool/objtool.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,10 @@ void objtool_pv_add(struct objtool_file *f, int idx, struct symbol *func)
153153
!strcmp(func->name, "_paravirt_ident_64"))
154154
return;
155155

156+
/* already added this function */
157+
if (!list_empty(&func->pv_target))
158+
return;
159+
156160
list_add(&func->pv_target, &f->pv_ops[idx].targets);
157161
f->pv_ops[idx].clean = false;
158162
}

0 commit comments

Comments
 (0)