|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] Re: [Xen-devel] [PATCH v8 14/16] x86/microcode: Synchronize late microcode loading
On 01.08.2019 12:22, Chao Gao wrote:
> @@ -234,6 +267,35 @@ bool microcode_update_cache(struct microcode_patch
> *patch)
> }
>
> /*
> + * Wait for a condition to be met with a timeout (us).
> + */
> +static int wait_for_condition(int (*func)(void *data), void *data,
> + unsigned int timeout)
> +{
> + while ( !func(data) )
> + {
> + if ( !timeout-- )
> + {
> + printk("CPU%u: Timeout in %s\n", smp_processor_id(), __func__);
I don't think __func__ is helpful here for problem analysis. Instead
I think you would want to log either func or __builtin_return_address(0).
> @@ -283,37 +345,105 @@ static int microcode_update_cpu(const struct
> microcode_patch *patch)
> return err;
> }
>
> -static long do_microcode_update(void *patch)
> +static int do_microcode_update(void *patch)
> {
> - unsigned int cpu;
> + unsigned int cpu = smp_processor_id();
> + unsigned int cpu_nr = num_online_cpus();
> + int ret;
>
> - /* store the patch after a successful loading */
> - if ( !microcode_update_cpu(patch) && patch )
> + /* Mark loading an ucode is in progress */
> + cmpxchg(&loading_state, LOADING_EXITED, LOADING_ENTERED);
Why cmpxchg(), especially when you don't check whether the store
has actually happened? (Same further down then.)
> + cpumask_set_cpu(cpu, &cpu_callin_map);
> + ret = wait_for_condition(wait_cpu_callin, (void *)(unsigned long)cpu_nr,
> + MICROCODE_CALLIN_TIMEOUT_US);
> + if ( ret )
> {
> - spin_lock(µcode_mutex);
> - microcode_update_cache(patch);
> - spin_unlock(µcode_mutex);
> - patch = NULL;
> + cmpxchg(&loading_state, LOADING_ENTERED, LOADING_ABORTED);
> + return ret;
> }
>
> - if ( microcode_ops->end_update )
> - microcode_ops->end_update();
> + /*
> + * Load microcode update on only one logical processor per core, or in
> + * AMD's term, one core per compute unit. The one with the lowest thread
> + * id among all siblings is chosen to perform the loading.
> + */
> + if ( (cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu))) )
> + {
> + static unsigned int panicked = 0;
> + bool monitor;
> + unsigned int done;
> + unsigned long tick = 0;
>
> - cpu = cpumask_next(smp_processor_id(), &cpu_online_map);
> - if ( cpu < nr_cpu_ids )
> - return continue_hypercall_on_cpu(cpu, do_microcode_update, patch);
> + ret = microcode_ops->apply_microcode(patch);
> + if ( !ret )
> + {
> + unsigned int cpu2;
>
> - /* Free the patch if no CPU has loaded it successfully. */
> - if ( patch )
> - microcode_free_patch(patch);
> + atomic_inc(&cpu_updated);
> + /* Propagate revision number to all siblings */
> + for_each_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu))
> + per_cpu(cpu_sig, cpu2).rev = this_cpu(cpu_sig).rev;
> + }
>
> - return 0;
> + /*
> + * The first CPU reaching here will monitor the progress and emit
> + * warning message if the duration is too long (e.g. >1 second).
> + */
> + monitor = !atomic_inc_return(&cpu_out);
> + if ( monitor )
> + tick = rdtsc_ordered();
> +
> + /* Waiting for all cores or computing units finishing update */
> + done = atomic_read(&cpu_out);
> + while ( panicked && done != nr_cores )
Don't you mean "!panicked" here, or || instead of && ? Otherwise I don't
see how the loop would ever be entered.
> + {
> + /*
> + * During each timeout interval, at least a CPU is expected to
> + * finish its update. Otherwise, something goes wrong.
> + *
> + * Note that RDTSC (in wait_for_condition()) is safe for threads
> to
> + * execute while waiting for completion of loading an update.
> + */
> + if ( wait_for_condition(&wait_cpu_callout,
> + (void *)(unsigned long)(done + 1),
> + MICROCODE_UPDATE_TIMEOUT_US) &&
> + !cmpxchg(&panicked, 0, 1) )
> + panic("Timeout when finishing updating microcode (finished
> %u/%u)",
> + done, nr_cores);
> +
> + /* Print warning message once if long time is spent here */
> + if ( monitor )
> + {
> + if ( rdtsc_ordered() - tick >= cpu_khz * 1000 )
> + {
> + printk(XENLOG_WARNING "WARNING: UPDATING MICROCODE HAS
> CONSUMED MORE THAN 1 SECOND!\n");
Please save a little bit on line length by wrapping lines before the
initial quote.
> + monitor = false;
> + }
> + }
Please combine both if()-s. And please also consider dropping the
"monitor" local variable - "tick" being non-zero can easily replace
it afaics.
> + done = atomic_read(&cpu_out);
> + }
> +
> + /* Mark loading is done to unblock other threads */
> + loading_state = LOADING_EXITED;
> + }
> + else
> + {
> + while ( loading_state == LOADING_ENTERED )
> + rep_nop();
Instead of the "for_each_cpu(cpu2, per_cpu(cpu_sibling_mask, cpu))"
above, wouldn't it be more logical to have each thread update its
signature here?
Also - do you perhaps mean cpu_relax() instead of rep_nop()?
> @@ -344,19 +481,67 @@ int microcode_update(XEN_GUEST_HANDLE_PARAM(const_void)
> buf, unsigned long len)
> {
> ret = PTR_ERR(patch);
> printk(XENLOG_INFO "Parsing microcode blob error %d\n", ret);
> - goto free;
> + goto put;
> }
>
> if ( !patch )
> {
> printk(XENLOG_INFO "No ucode found. Update aborted!\n");
> ret = -EINVAL;
> - goto free;
> + goto put;
> + }
> +
> + cpumask_clear(&cpu_callin_map);
> + atomic_set(&cpu_out, 0);
> + atomic_set(&cpu_updated, 0);
> + loading_state = LOADING_EXITED;
> +
> + /* Calculate the number of online CPU core */
> + nr_cores = 0;
> + for_each_online_cpu(cpu)
> + if ( cpu == cpumask_first(per_cpu(cpu_sibling_mask, cpu)) )
> + nr_cores++;
> +
> + printk(XENLOG_INFO "%u cores are to update their microcode\n", nr_cores);
> +
> + /*
> + * We intend to disable interrupt for long time, which may lead to
> + * watchdog timeout.
> + */
> + watchdog_disable();
> + /*
> + * Late loading dance. Why the heavy-handed stop_machine effort?
> + *
> + * - HT siblings must be idle and not execute other code while the other
> + * sibling is loading microcode in order to avoid any negative
> + * interactions cause by the loading.
> + *
> + * - In addition, microcode update on the cores must be serialized until
> + * this requirement can be relaxed in the future. Right now, this is
> + * conservative and good.
> + */
> + ret = stop_machine_run(do_microcode_update, patch, NR_CPUS);
> + watchdog_enable();
> +
> + updated = atomic_read(&cpu_updated);
> + if ( updated > 0 )
> + {
> + spin_lock(µcode_mutex);
> + microcode_update_cache(patch);
> + spin_unlock(µcode_mutex);
> }
> + else
> + microcode_free_patch(patch);
>
> - ret = continue_hypercall_on_cpu(cpumask_first(&cpu_online_map),
> - do_microcode_update, patch);
> + if ( updated && updated != nr_cores )
> + printk(XENLOG_ERR
> + "ERROR: Updating microcode succeeded on %u cores and failed
> on\n"
> + "other %u cores. A system with differing microcode revisions
> is\n"
> + "considered unstable. Please reboot and do not load the
> microcode\n"
> + "that triggers this warning!\n", updated, nr_cores - updated);
Note that for such multi-line log messages you need to prefix XENLOG_*
to every one of them.
Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |