|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH v4 6/6] x86/microcode: Synchronize late microcode loading
On Wed, Nov 28, 2018 at 01:34:16PM +0800, Chao Gao wrote:
> This patch ports microcode improvement patches from linux kernel.
>
> Before you read any further: the early loading method is still the
> preferred one and you should always do that. The following patch is
> improving the late loading mechanism for long running jobs and cloud use
> cases.
>
> Gather all cores and serialize the microcode update on them by doing it
> one-by-one to make the late update process as reliable as possible and
> avoid potential issues caused by the microcode update.
>
> Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
> Tested-by: Chao Gao <chao.gao@xxxxxxxxx>
> [linux commit: a5321aec6412b20b5ad15db2d6b916c05349dbff]
> [linux commit: bb8c13d61a629276a162c1d2b1a20a815cbcfbb7]
If this patch is the squash of two Linux commits, please post the
ported versions of the two commits separately.
> Cc: Kevin Tian <kevin.tian@xxxxxxxxx>
> Cc: Jun Nakajima <jun.nakajima@xxxxxxxxx>
> Cc: Ashok Raj <ashok.raj@xxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Cc: Jan Beulich <jbeulich@xxxxxxxx>
> ---
> xen/arch/x86/microcode.c | 123
> +++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 97 insertions(+), 26 deletions(-)
>
> diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
> index 0b435f4..d5a2a94 100644
> --- a/xen/arch/x86/microcode.c
> +++ b/xen/arch/x86/microcode.c
> @@ -22,6 +22,7 @@
> */
>
> #include <xen/cpu.h>
> +#include <xen/cpumask.h>
> #include <xen/lib.h>
> #include <xen/kernel.h>
> #include <xen/init.h>
> @@ -30,18 +31,25 @@
> #include <xen/smp.h>
> #include <xen/softirq.h>
> #include <xen/spinlock.h>
> +#include <xen/stop_machine.h>
> #include <xen/tasklet.h>
> #include <xen/guest_access.h>
> #include <xen/earlycpio.h>
> +#include <xen/watchdog.h>
>
> +#include <asm/delay.h>
> #include <asm/msr.h>
> #include <asm/processor.h>
> #include <asm/setup.h>
> #include <asm/microcode.h>
>
> +/* By default, wait for 30000us */
> +#define MICROCODE_DEFAULT_TIMEOUT_US 30000
> +
> static module_t __initdata ucode_mod;
> static signed int __initdata ucode_mod_idx;
> static bool_t __initdata ucode_mod_forced;
> +static unsigned int nr_cores;
>
> /*
> * If we scan the initramfs.cpio for the early microcode code
> @@ -189,8 +197,7 @@ static DEFINE_SPINLOCK(microcode_mutex);
> DEFINE_PER_CPU(struct ucode_cpu_info, ucode_cpu_info);
>
> struct microcode_info {
> - unsigned int cpu;
> - int error;
> + atomic_t cpu_in, cpu_out;
Can you make this variables global to the file and just remove
microcode_info?
> };
>
> static void __microcode_fini_cpu(unsigned int cpu)
> @@ -242,31 +249,62 @@ static int microcode_update_cpu(void)
> return err;
> }
>
> -static long do_microcode_update(void *_info)
> +/* Wait for all CPUs to rendezvous with a timeout (us) */
> +static int wait_for_cpus(atomic_t *cnt, unsigned int timeout)
> {
> - struct microcode_info *info = _info;
> - int error;
> + unsigned int cpus = num_online_cpus();
>
> - BUG_ON(info->cpu != smp_processor_id());
> + atomic_inc(cnt);
>
> - error = microcode_update_cpu();
> - if ( error )
> - info->error = error;
> + while ( atomic_read(cnt) != cpus )
> + {
> + if ( timeout <= 0 )
> + {
> + printk("Timeout when waiting for CPUs calling in\n");
> + return -EBUSY;
> + }
> + udelay(1);
> + timeout--;
> + }
>
> - info->cpu = cpumask_next(info->cpu, &cpu_online_map);
> - if ( info->cpu < nr_cpu_ids )
> - return continue_hypercall_on_cpu(info->cpu, do_microcode_update,
> info);
> + return 0;
> +}
>
> - error = info->error;
> - xfree(info);
> - return error;
> +static int do_microcode_update(void *_info)
> +{
> + struct microcode_info *info = _info;
> + unsigned int cpu = smp_processor_id();
> + int ret;
> +
> + ret = wait_for_cpus(&info->cpu_in, MICROCODE_DEFAULT_TIMEOUT_US);
> + if ( ret )
> + return ret;
> +
> + /*
> + * Initiate an update on all processors which don't have an online
> sibling
> + * thread with a lower thread id. Other sibling threads just await the
> + * completion of microcode update.
> + */
> + if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
> + ret = microcode_update_cpu();
> + /*
> + * Increase the wait timeout to a safe value here since we're serializing
> + * the microcode update and that could take a while on a large number of
> + * CPUs. And that is fine as the *actual* timeout will be determined by
> + * the last CPU finished updating and thus cut short
> + */
> + if ( wait_for_cpus(&info->cpu_out, MICROCODE_DEFAULT_TIMEOUT_US *
> + nr_cores) )
Isn't this likely to trigger the watchdog on big systems? Oh I see
below that you disable the watchdog.
> + panic("Timeout when finishing updating microcode");
> +
> + return ret;
> }
>
> int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void) buf, unsigned long
> len)
> {
> int ret;
> - struct microcode_info *info;
> unsigned int cpu = smp_processor_id();
> + struct microcode_info *info;
> struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
> void * buffer;
>
> @@ -283,19 +321,20 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void)
> buf, unsigned long len)
>
> ret = copy_from_guest(buffer, buf, len);
> if ( ret != 0 )
> + goto free;
> +
> + /* cpu_online_map must not change during update */
> + if ( !get_cpu_maps() )
> {
> - xfree(info);
> - return ret;
> + ret = -EBUSY;
> + goto free;
> }
>
> if ( microcode_ops->start_update )
> {
> ret = microcode_ops->start_update();
> if ( ret != 0 )
> - {
> - xfree(info);
> - return ret;
> - }
> + goto put;
> }
>
> spin_lock(µcode_mutex);
> @@ -311,13 +350,45 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void)
> buf, unsigned long len)
> if ( ret <= 0 )
> {
> printk("No valid or newer microcode found. Update abort!\n");
> - return -EINVAL;
> + ret = -EINVAL;
> + goto put;
> }
>
> - info->error = 0;
> - info->cpu = cpumask_first(&cpu_online_map);
> + atomic_set(&info->cpu_in, 0);
> + atomic_set(&info->cpu_out, 0);
> +
> + /* Calculate the number of online CPU core */
> + nr_cores = 0;
> + for_each_online_cpu(cpu)
> + if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
> + nr_cores++;
> +
> + printk("%d cores are to update its microcode\n", nr_cores);
>
> - return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
> + /*
> + * We intend to disable interrupt for long time, which may lead to
> + * watchdog timeout.
> + */
> + watchdog_disable();
> + /*
> + * Late loading dance. Why the heavy-handed stop_machine effort?
> + *
> + * - HT siblings must be idle and not execute other code while the other
> + * sibling is loading microcode in order to avoid any negative
> + * interactions cause by the loading.
Well, the HT siblings will be executing code, since they are in a
while loop waiting for the non-siblings cores to finish updating.
Thanks, Roger.
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |