|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 1/2] x86/microcode: Synchronize late microcode loading
From: Gao Chao <chao.gao@xxxxxxxxx>
This patch is to backport microcode improvement patches from linux
kernel. Below are the original patches description:
commit a5321aec6412b20b5ad15db2d6b916c05349dbff
Author: Ashok Raj <ashok.raj@xxxxxxxxx>
Date: Wed Feb 28 11:28:46 2018 +0100
x86/microcode: Synchronize late microcode loading
Original idea by Ashok, completely rewritten by Borislav.
Before you read any further: the early loading method is still the
preferred one and you should always do that. The following patch is
improving the late loading mechanism for long running jobs and cloud use
cases.
Gather all cores and serialize the microcode update on them by doing it
one-by-one to make the late update process as reliable as possible and
avoid potential issues caused by the microcode update.
[ Borislav: Rewrite completely. ]
Co-developed-by: Borislav Petkov <bp@xxxxxxx>
Signed-off-by: Ashok Raj <ashok.raj@xxxxxxxxx>
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Tested-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
Tested-by: Ashok Raj <ashok.raj@xxxxxxxxx>
Reviewed-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
Cc: Arjan Van De Ven <arjan.van.de.ven@xxxxxxxxx>
Link: https://lkml.kernel.org/r/20180228102846.13447-8-bp@xxxxxxxxx
commit bb8c13d61a629276a162c1d2b1a20a815cbcfbb7
Author: Borislav Petkov <bp@xxxxxxx>
Date: Wed Mar 14 19:36:15 2018 +0100
x86/microcode: Fix CPU synchronization routine
Emanuel reported an issue with a hang during microcode update because my
dumb idea to use one atomic synchronization variable for both rendezvous
- before and after update - was simply bollocks:
microcode: microcode_reload_late: late_cpus: 4
microcode: __reload_late: cpu 2 entered
microcode: __reload_late: cpu 1 entered
microcode: __reload_late: cpu 3 entered
microcode: __reload_late: cpu 0 entered
microcode: __reload_late: cpu 1 left
microcode: Timeout while waiting for CPUs rendezvous, remaining: 1
CPU1 above would finish, leave and the others will still spin waiting
for
it to join.
So do two synchronization atomics instead, which makes the code a lot
more
straightforward.
Also, since the update is serialized and it also takes quite some time
per
microcode engine, increase the exit timeout by the number of CPUs on the
system.
That's ok because the moment all CPUs are done, that timeout will be cut
short.
Furthermore, panic when some of the CPUs timeout when returning from a
microcode update: we can't allow a system with not all cores updated.
Also, as an optimization, do not do the exit sync if microcode wasn't
updated.
Reported-by: Emanuel Czirai <xftroxgpx@xxxxxxxxxxxxxx>
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Tested-by: Emanuel Czirai <xftroxgpx@xxxxxxxxxxxxxx>
Tested-by: Ashok Raj <ashok.raj@xxxxxxxxx>
Tested-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@xxxxxxxxx
This patch is also in accord with Andrew's suggestion,
"Rendezvous all online cpus in an IPI to apply the patch, and keep the
processors in until all have completed the patch.", in [1].
[1]:https://wiki.xenproject.org/wiki/XenParavirtOps/microcode_update#Run_time_microcode_updates
Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
Cc: Jun Nakajima <jun.nakajima@xxxxxxxxx>
Cc: Ashok Raj <ashok.raj@xxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
---
xen/arch/x86/microcode.c | 89 +++++++++++++++++++++++++++++++++++++++---------
1 file changed, 73 insertions(+), 16 deletions(-)
diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
index 4163f50..94c1ca2 100644
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -22,6 +22,7 @@
*/
#include <xen/cpu.h>
+#include <xen/cpumask.h>
#include <xen/lib.h>
#include <xen/kernel.h>
#include <xen/init.h>
@@ -30,15 +31,20 @@
#include <xen/smp.h>
#include <xen/softirq.h>
#include <xen/spinlock.h>
+#include <xen/stop_machine.h>
#include <xen/tasklet.h>
#include <xen/guest_access.h>
#include <xen/earlycpio.h>
+#include <xen/watchdog.h>
+#include <asm/delay.h>
#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/microcode.h>
+#define USEC_PER_SEC 1000000UL
+
static module_t __initdata ucode_mod;
static signed int __initdata ucode_mod_idx;
static bool_t __initdata ucode_mod_forced;
@@ -187,7 +193,7 @@ static DEFINE_SPINLOCK(microcode_mutex);
DEFINE_PER_CPU(struct ucode_cpu_info, ucode_cpu_info);
struct microcode_info {
- unsigned int cpu;
+ atomic_t cpu_in, cpu_out;
uint32_t buffer_size;
int error;
char buffer[1];
@@ -281,24 +287,52 @@ static int microcode_update_cpu(const void *buf, size_t
size)
return err;
}
-static long do_microcode_update(void *_info)
+static int __wait_for_cpus(atomic_t *cnt, int timeout)
{
- struct microcode_info *info = _info;
- int error;
+ int cpus = num_online_cpus();
- BUG_ON(info->cpu != smp_processor_id());
+ atomic_inc(cnt);
- error = microcode_update_cpu(info->buffer, info->buffer_size);
- if ( error )
- info->error = error;
+ while (atomic_read(cnt) != cpus)
+ {
+ if ( timeout <= 0 )
+ {
+ printk("Timeout when waiting for CPUs calling in\n");
+ return -1;
+ }
+ udelay(1);
+ timeout--;
+ }
- info->cpu = cpumask_next(info->cpu, &cpu_online_map);
- if ( info->cpu < nr_cpu_ids )
- return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+ return 0;
+}
- error = info->error;
- xfree(info);
- return error;
+static int do_microcode_update(void *_info)
+{
+ struct microcode_info *info = _info;
+ int error, ret = 0;
+
+ error = __wait_for_cpus(&info->cpu_in, USEC_PER_SEC);
+ if ( error )
+ {
+ ret = -EBUSY;
+ return ret;
+ }
+
+ error = microcode_update_cpu(info->buffer, info->buffer_size);
+ if ( error && !ret )
+ ret = error;
+ /*
+ * Increase the wait timeout to a safe value here since we're serializing
+ * the microcode update and that could take a while on a large number of
+ * CPUs. And that is fine as the *actual* timeout will be determined by
+ * the last CPU finished updating and thus cut short
+ */
+ error = __wait_for_cpus(&info->cpu_out, USEC_PER_SEC * num_online_cpus());
+ if (error)
+ panic("Timeout when finishing updating microcode");
+ info->error = ret;
+ return ret;
}
int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long len)
@@ -325,7 +359,6 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void)
buf, unsigned long len)
info->buffer_size = len;
info->error = 0;
- info->cpu = cpumask_first(&cpu_online_map);
if ( microcode_ops->start_update )
{
@@ -337,7 +370,31 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void)
buf, unsigned long len)
}
}
- return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
+ atomic_set(&info->cpu_in, 0);
+ atomic_set(&info->cpu_out, 0);
+
+ /*
+ * We intend to disable interrupt for long time, which may lead to
+ * watchdog timeout. Disable watchdog temporally.
+ */
+ watchdog_disable();
+ /*
+ * Late loading dance. Why the heavy-handed stop_machine effort?
+ *
+ * -HT siblings must be idle and not execute other code while the other
+ * sibling is loading microcode in order to avoid any negative
+ * interactions cause by the loading.
+ *
+ * -In addition, microcode update on the cores must be serialized until
+ * this requirement can be relaxed in the feature. Right now, this is
+ * conservative and good.
+ */
+ stop_machine_run(do_microcode_update, info, NR_CPUS);
+ watchdog_enable();
+
+ ret = info->error;
+ xfree(info);
+ return ret;
}
static int __init microcode_init(void)
--
1.8.3.1
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |