Skip to content

Commit bb8c13d

Browse files
suryasaimadhu authored
and KAGA-KOKO committed
x86/microcode: Fix CPU synchronization routine
Emanuel reported an issue with a hang during microcode update because my dumb idea to use one atomic synchronization variable for both rendezvous - before and after update - was simply bollocks:

  microcode: microcode_reload_late: late_cpus: 4
  microcode: __reload_late: cpu 2 entered
  microcode: __reload_late: cpu 1 entered
  microcode: __reload_late: cpu 3 entered
  microcode: __reload_late: cpu 0 entered
  microcode: __reload_late: cpu 1 left
  microcode: Timeout while waiting for CPUs rendezvous, remaining: 1

CPU1 above would finish, leave and the others will still spin waiting for it to join.

So do two synchronization atomics instead, which makes the code a lot more straightforward.

Also, since the update is serialized and it also takes quite some time per microcode engine, increase the exit timeout by the number of CPUs on the system. That's ok because the moment all CPUs are done, that timeout will be cut short.

Furthermore, panic when some of the CPUs timeout when returning from a microcode update: we can't allow a system with not all cores updated.

Also, as an optimization, do not do the exit sync if microcode wasn't updated.

Reported-by: Emanuel Czirai <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Tested-by: Emanuel Czirai <[email protected]>
Tested-by: Ashok Raj <[email protected]>
Tested-by: Tom Lendacky <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent 2613f36 commit bb8c13d

File tree

1 file changed

+41
-27
lines changed
  • arch/x86/kernel/cpu/microcode

1 file changed

+41
-27
lines changed

arch/x86/kernel/cpu/microcode/core.c

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,29 @@ static int check_online_cpus(void)
517517
return -EINVAL;
518518
}
519519

520-
/*
 * Two separate rendezvous counters: one for CPUs arriving at the update
 * (late_cpus_in) and one for CPUs leaving it (late_cpus_out). Using a
 * single counter for both sync points allowed a fast CPU to leave while
 * the others still spun on the entry value (see commit message).
 */
static atomic_t late_cpus_in;
static atomic_t late_cpus_out;

/*
 * Announce this CPU's arrival at the rendezvous point @t and spin until
 * all online CPUs have arrived, or @timeout nanoseconds elapse.
 *
 * Returns 0 when every online CPU checked in, 1 on timeout.
 */
static int __wait_for_cpus(atomic_t *t, long long timeout)
{
	int online = num_online_cpus();

	atomic_inc(t);

	for (;;) {
		/* All CPUs have checked in - rendezvous complete. */
		if (atomic_read(t) >= online)
			return 0;

		if (timeout < SPINUNIT) {
			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
			       online - atomic_read(t));
			return 1;
		}

		ndelay(SPINUNIT);
		timeout -= SPINUNIT;

		/* Keep the NMI watchdog from firing while we spin. */
		touch_nmi_watchdog();
	}
}
521543

522544
/*
523545
* Returns:
@@ -527,46 +549,39 @@ static atomic_t late_cpus;
527549
*/
528550
static int __reload_late(void *info)
529551
{
530-
unsigned int timeout = NSEC_PER_SEC;
531-
int all_cpus = num_online_cpus();
532552
int cpu = smp_processor_id();
533553
enum ucode_state err;
534554
int ret = 0;
535555

536-
atomic_dec(&late_cpus);
537-
538556
/*
539557
* Wait for all CPUs to arrive. A load will not be attempted unless all
540558
* CPUs show up.
541559
* */
542-
while (atomic_read(&late_cpus)) {
543-
if (timeout < SPINUNIT) {
544-
pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
545-
atomic_read(&late_cpus));
546-
return -1;
547-
}
548-
549-
ndelay(SPINUNIT);
550-
timeout -= SPINUNIT;
551-
552-
touch_nmi_watchdog();
553-
}
560+
if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
561+
return -1;
554562

555563
spin_lock(&update_lock);
556564
apply_microcode_local(&err);
557565
spin_unlock(&update_lock);
558566

559567
if (err > UCODE_NFOUND) {
560568
pr_warn("Error reloading microcode on CPU %d\n", cpu);
561-
ret = -1;
562-
} else if (err == UCODE_UPDATED) {
569+
return -1;
570+
/* siblings return UCODE_OK because their engine got updated already */
571+
} else if (err == UCODE_UPDATED || err == UCODE_OK) {
563572
ret = 1;
573+
} else {
574+
return ret;
564575
}
565576

566-
atomic_inc(&late_cpus);
567-
568-
while (atomic_read(&late_cpus) != all_cpus)
569-
cpu_relax();
577+
/*
578+
* Increase the wait timeout to a safe value here since we're
579+
* serializing the microcode update and that could take a while on a
580+
* large number of CPUs. And that is fine as the *actual* timeout will
581+
* be determined by the last CPU finished updating and thus cut short.
582+
*/
583+
if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
584+
panic("Timeout during microcode update!\n");
570585

571586
return ret;
572587
}
@@ -579,12 +594,11 @@ static int microcode_reload_late(void)
579594
{
580595
int ret;
581596

582-
atomic_set(&late_cpus, num_online_cpus());
597+
atomic_set(&late_cpus_in, 0);
598+
atomic_set(&late_cpus_out, 0);
583599

584600
ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
585-
if (ret < 0)
586-
return ret;
587-
else if (ret > 0)
601+
if (ret > 0)
588602
microcode_check();
589603

590604
return ret;

0 commit comments

Comments
 (0)