Re: [Xen-devel] [PATCH v3] x86: use VMLOAD for PV context switch
On 18/09/18 13:28, Jan Beulich wrote:
> Besides the mentioned oddity with measured performance, I've also
> noticed a significant difference (of at least 150 clocks) between
> measuring immediately around the calls to svm_load_segs() and measuring
> immediately inside the function.
This is a little concerning. It either means there is a bug in your timing, or
(along the same lines as why I think the prefetch makes a difference) the
mapping of svm_load_segs() is reliably TLB-cold in the context switch path.
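
(For reference, the usual way to bracket such a call from the outside is RDTSCP
plus taking the minimum over many runs.  A minimal user-space sketch, not Xen
code, with stub_load_segs() as a hypothetical stand-in for svm_load_segs():)

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

/* Hypothetical stand-in for the function being measured. */
static void __attribute__((noinline)) stub_load_segs(void)
{
    asm volatile ( "" ::: "memory" );
}

int main(void)
{
    unsigned int aux;
    uint64_t best = UINT64_MAX;

    /* Take the minimum over many runs to filter out interrupts etc. */
    for ( unsigned int i = 0; i < 100000; i++ )
    {
        uint64_t t0 = __rdtscp(&aux);

        stub_load_segs();

        uint64_t t1 = __rdtscp(&aux);

        if ( t1 - t0 < best )
            best = t1 - t0;
    }

    printf("min cycles around the call: %llu\n", (unsigned long long)best);
    return 0;
}
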
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -52,6 +52,7 @@
> #include <asm/hvm/hvm.h>
> #include <asm/hvm/nestedhvm.h>
> #include <asm/hvm/support.h>
> +#include <asm/hvm/svm/svm.h>
> #include <asm/hvm/viridian.h>
> #include <asm/debugreg.h>
> #include <asm/msr.h>
> @@ -1281,11 +1282,35 @@ static void load_segments(struct vcpu *n
> struct cpu_user_regs *uregs = &n->arch.user_regs;
> int all_segs_okay = 1;
> unsigned int dirty_segment_mask, cpu = smp_processor_id();
> + bool fs_gs_done = false;
>
> /* Load and clear the dirty segment mask. */
> dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
> per_cpu(dirty_segment_mask, cpu) = 0;
>
> +#ifdef CONFIG_HVM
> + if ( !is_pv_32bit_vcpu(n) && !cpu_has_fsgsbase && cpu_has_svm &&
> + !((uregs->fs | uregs->gs) & ~3) &&
> + /*
> + * The remaining part is just for optimization: If only shadow GS
> + * needs loading, there's nothing to be gained here.
VMLOAD also loads the LDT, and LLDT is fully serialising, so it's an even
heavier perf hit than a WRMSR.
> + */
> + (n->arch.pv.fs_base | n->arch.pv.gs_base_user) )
> + {
> + fs_gs_done = n->arch.flags & TF_kernel_mode
> + ? svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> + uregs->fs, n->arch.pv.fs_base,
> + uregs->gs, n->arch.pv.gs_base_kernel,
> + n->arch.pv.gs_base_user)
> + : svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
> + uregs->fs, n->arch.pv.fs_base,
> + uregs->gs, n->arch.pv.gs_base_user,
> + n->arch.pv.gs_base_kernel);
This looks like a confusing way of writing:
{
    unsigned long gsb = n->arch.flags & TF_kernel_mode
        ? n->arch.pv.gs_base_kernel : n->arch.pv.gs_base_user;
    unsigned long gss = n->arch.flags & TF_kernel_mode
        ? n->arch.pv.gs_base_user : n->arch.pv.gs_base_kernel;

    fs_gs_done = svm_load_segs(n->arch.pv.ldt_ents, LDT_VIRT_START(n),
                               uregs->fs, n->arch.pv.fs_base,
                               uregs->gs, gsb, gss);
}
AFAICT?
> + }
> +#endif
> + if ( !fs_gs_done )
> + load_LDT(n);
> +
> /* Either selector != 0 ==> reload. */
> if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
> {
> @@ -1301,7 +1326,7 @@ static void load_segments(struct vcpu *n
> }
>
> /* Either selector != 0 ==> reload. */
> - if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
> + if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) &&
> +          !fs_gs_done )
> {
> all_segs_okay &= loadsegment(fs, uregs->fs);
> /* non-nul selector updates fs_base */
> @@ -1310,7 +1335,7 @@ static void load_segments(struct vcpu *n
> }
>
> /* Either selector != 0 ==> reload. */
> - if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
> + if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) &&
> +          !fs_gs_done )
One too many spaces.
> {
> all_segs_okay &= loadsegment(gs, uregs->gs);
> /* non-nul selector updates gs_base_user */
> @@ -1318,7 +1343,7 @@ static void load_segments(struct vcpu *n
> dirty_segment_mask &= ~DIRTY_GS_BASE;
> }
>
> - if ( !is_pv_32bit_vcpu(n) )
> + if ( !fs_gs_done && !is_pv_32bit_vcpu(n) )
> {
> /* This can only be non-zero if selector is NULL. */
> if ( n->arch.pv.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
> @@ -1653,6 +1678,12 @@ static void __context_switch(void)
>
> write_ptbase(n);
>
> +#if defined(CONFIG_PV) && defined(CONFIG_HVM)
From a comments-in-the-code point of view, this is the most useful location
for something along the lines of:
/* Prefetch the VMCB if we expect to use it later in the context switch */
> + if ( is_pv_domain(nd) && !is_pv_32bit_domain(nd) && !is_idle_domain(nd) &&
> + !cpu_has_fsgsbase && cpu_has_svm )
> + svm_load_segs(0, 0, 0, 0, 0, 0, 0);
> +#endif
> +
> if ( need_full_gdt(nd) &&
> ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
> {
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1630,6 +1646,66 @@ static void svm_init_erratum_383(const s
> }
> }
>
> +#ifdef CONFIG_PV
> +bool svm_load_segs(unsigned int ldt_ents, unsigned long ldt_base,
> + unsigned int fs_sel, unsigned long fs_base,
> + unsigned int gs_sel, unsigned long gs_base,
> + unsigned long gs_shadow)
> +{
> + unsigned int cpu = smp_processor_id();
> + struct vmcb_struct *vmcb = per_cpu(host_vmcb_va, cpu);
> +
> + if ( unlikely(!vmcb) )
> + return false;
When can this error path ever be taken?
> +
> + if ( !ldt_base )
> + {
> + /*
> + * The actual structure field used here was arbitrarily chosen.
> + * Empirically it doesn't seem to matter much which element is used,
> + * and a clear explanation of the otherwise poor performance has not
> + * been found/provided so far.
> + */
> + asm volatile ( "prefetch %0" :: "m" (vmcb->ldtr) );
prefetchw(), which already exists and is used.
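i.e. presumably just (a sketch, assuming the usual pointer-taking form):

    prefetchw(&vmcb->ldtr);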
~Andrew
> + return true;
> + }
> +
> + if ( likely(!ldt_ents) )
> + memset(&vmcb->ldtr, 0, sizeof(vmcb->ldtr));
> + else
> + {
> + /* Keep GDT in sync. */
> + struct desc_struct *desc = this_cpu(gdt_table) + LDT_ENTRY -
> + FIRST_RESERVED_GDT_ENTRY;
> +
> + _set_tssldt_desc(desc, ldt_base, ldt_ents * 8 - 1, SYS_DESC_ldt);
> +
> + vmcb->ldtr.sel = LDT_ENTRY << 3;
> + vmcb->ldtr.attr = SYS_DESC_ldt | (_SEGMENT_P >> 8);
> + vmcb->ldtr.limit = ldt_ents * 8 - 1;
> + vmcb->ldtr.base = ldt_base;
> + }
> +
> + ASSERT(!(fs_sel & ~3));
> + vmcb->fs.sel = fs_sel;
> + vmcb->fs.attr = 0;
> + vmcb->fs.limit = 0;
> + vmcb->fs.base = fs_base;
> +
> + ASSERT(!(gs_sel & ~3));
> + vmcb->gs.sel = gs_sel;
> + vmcb->gs.attr = 0;
> + vmcb->gs.limit = 0;
> + vmcb->gs.base = gs_base;
> +
> + vmcb->kerngsbase = gs_shadow;
> +
> + svm_vmload_pa(per_cpu(host_vmcb, cpu));
> +
> + return true;
> +}
> +#endif
> +
> static int _svm_cpu_up(bool bsp)
> {
> uint64_t msr_content;