Re: [Xen-devel] [PATCH v5 1/7] x86/xpti: avoid copying L4 page table contents when possible
On 06/04/18 08:52, Juergen Gross wrote:
> For mitigation of Meltdown the current L4 page table is copied to the
> cpu local root page table each time a 64 bit pv guest is entered.
>
> Copying can be avoided in cases where the guest L4 page table hasn't
> been modified while running the hypervisor, e.g. when handling
> interrupts or any hypercall not modifying the L4 page table or %cr3.
>
> So add a per-cpu flag whether the copying should be performed and set
"flag indicating whether"
> that flag only when loading a new %cr3 or modifying the L4 page table.
> This includes synchronization of the cpu local root page table with
> other cpus, so add a special synchronization flag for that case.
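To spell the scheme out: this is a per-cpu dirty flag. Every path that
can change the active L4 or load a new %cr3 sets the flag; the
guest-entry path tests and clears it, and skips the copy when it is
clear. As a minimal C sketch (not the actual Xen code;
copy_l4_to_root_pgt() is a stand-in for the copy loop in entry.S):

    /* Writer side: anything modifying the active L4 or loading %cr3. */
    get_cpu_info()->root_pgt_changed = true;

    /* Guest entry: copy only if something changed since the last entry. */
    if ( get_cpu_info()->root_pgt_changed )
    {
        get_cpu_info()->root_pgt_changed = false;
        copy_l4_to_root_pgt();    /* stand-in for the rep movsq copy */
    }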
>
> A simple performance check (compiling the hypervisor via "make -j 4")
> in dom0 with 4 vcpus shows a significant improvement:
>
> - real time drops from 113 seconds to 109 seconds
> - system time drops from 165 seconds to 155 seconds
>
> Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
> Reviewed-by: Jan Beulich <jbeulich@xxxxxxxx>
> ---
> V4:
> - move setting of root_pgt_changed flag in flush_area_local() out of
> irq disabled section (Jan Beulich)
> - move setting of root_pgt_changed in make_cr3() to _toggle_guest_pt()
> (Jan Beulich)
> - remove most conditionals in write_ptbase() (Jan Beulich)
> - don't set root_pgt_changed in do_mmu_update() for modification of
> the user page table (Jan Beulich)
>
> V3:
> - set flag locally only if affected L4 is active (Jan Beulich)
> - add setting flag to flush_area_mask() (Jan Beulich)
> - set flag in make_cr3() only if called for current active vcpu
>
> To be applied on top of Jan's "Meltdown band-aid overhead reduction"
> series
> ---
> xen/arch/x86/flushtlb.c | 3 +++
> xen/arch/x86/mm.c | 38 +++++++++++++++++++++++++-------------
> xen/arch/x86/pv/domain.c | 2 ++
> xen/arch/x86/smp.c | 2 +-
> xen/arch/x86/x86_64/asm-offsets.c | 1 +
> xen/arch/x86/x86_64/entry.S | 8 ++++++--
> xen/include/asm-x86/current.h | 8 ++++++++
> xen/include/asm-x86/flushtlb.h | 2 ++
> 8 files changed, 48 insertions(+), 16 deletions(-)
>
> diff --git a/xen/arch/x86/flushtlb.c b/xen/arch/x86/flushtlb.c
> index 8a7a76b8ff..38cedf3b22 100644
> --- a/xen/arch/x86/flushtlb.c
> +++ b/xen/arch/x86/flushtlb.c
> @@ -160,5 +160,8 @@ unsigned int flush_area_local(const void *va, unsigned int flags)
>
> local_irq_restore(irqfl);
>
> + if ( flags & FLUSH_ROOT_PGTBL )
> + get_cpu_info()->root_pgt_changed = true;
> +
> return flags;
> }
> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
> index 6d39d2c8ab..fd89685486 100644
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -504,6 +504,8 @@ void make_cr3(struct vcpu *v, mfn_t mfn)
>
> void write_ptbase(struct vcpu *v)
> {
> + if ( this_cpu(root_pgt) )
I'm not sure this conditional is wise. A call hitting here has changed
the root pgt, irrespective of whether the exit path cares.
> + get_cpu_info()->root_pgt_changed = true;
> write_cr3(v->arch.cr3);
> }
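The unconditional form being suggested would presumably be just:

    void write_ptbase(struct vcpu *v)
    {
        /*
         * The root pgt (if any) is stale from here on, whether or not
         * the exit path will consult the flag.
         */
        get_cpu_info()->root_pgt_changed = true;
        write_cr3(v->arch.cr3);
    }

with the write being harmless when XPTI is off, since there is then no
per-cpu root page table for the exit path to resync.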
>
> @@ -3701,18 +3703,28 @@ long do_mmu_update(
> break;
> rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
> cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
> - /*
> - * No need to sync if all uses of the page can be accounted
> - * to the page lock we hold, its pinned status, and uses on
> - * this (v)CPU.
> - * this (v)CPU.
> - */
> - if ( !rc && !cpu_has_no_xpti &&
> - ((page->u.inuse.type_info & PGT_count_mask) >
> - (1 + !!(page->u.inuse.type_info & PGT_pinned) +
> - (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
> - (pagetable_get_pfn(curr->arch.guest_table_user) ==
> - mfn))) )
> - sync_guest = true;
> + if ( !rc && !cpu_has_no_xpti )
> + {
> + bool local_in_use = false;
> +
> + if ( pagetable_get_pfn(curr->arch.guest_table) == mfn )
> + {
> + local_in_use = true;
> + get_cpu_info()->root_pgt_changed = true;
> + }
> +
> + /*
> + * No need to sync if all uses of the page can be
> + * accounted to the page lock we hold, its pinned
> + * status, and uses on this (v)CPU.
> + */
> + if ( (page->u.inuse.type_info & PGT_count_mask) >
> + (1 + !!(page->u.inuse.type_info & PGT_pinned) +
> + (pagetable_get_pfn(curr->arch.guest_table_user) ==
> + mfn) +
> + local_in_use) )
These two lines can be folded together.
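(i.e. presumably folded to "mfn) + local_in_use) )" on a single line.)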
> + sync_guest = true;
> + }
> break;
>
> case PGT_writable_page:
> @@ -3827,7 +3839,7 @@ long do_mmu_update(
>
> cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
> if ( !cpumask_empty(mask) )
> - flush_mask(mask, FLUSH_TLB_GLOBAL);
> + flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
> }
>
> perfc_add(num_page_updates, i);
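To spell out the accounting in the L4 hunk above: PGT_count_mask
extracts the number of outstanding type references on the L4 page, and
the right-hand side of the comparison counts every reference that is
provably local or already serialised. Schematically (stand-in names,
not real code):

    /*
     * References we can account for without a remote sync:
     *   1           - the type reference taken for this update
     *   pinned      - the PGT_pinned reference, if any
     *   kernel_use  - guest_table == mfn (also sets root_pgt_changed)
     *   user_use    - guest_table_user == mfn
     */
    if ( type_count > 1 + pinned + kernel_use + user_use )
        sync_guest = true;    /* another (v)CPU may have this L4 loaded */

If the count exceeds that sum, some other (v)CPU may be running on this
L4, so its per-cpu root page table has to be resynced, hence the
FLUSH_ROOT_PGTBL added to flush_mask() above.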
> diff --git a/xen/arch/x86/pv/domain.c b/xen/arch/x86/pv/domain.c
> index 01c62e2d45..42522a2db3 100644
> --- a/xen/arch/x86/pv/domain.c
> +++ b/xen/arch/x86/pv/domain.c
> @@ -223,6 +223,8 @@ static void _toggle_guest_pt(struct vcpu *v)
> {
> v->arch.flags ^= TF_kernel_mode;
> update_cr3(v);
> + get_cpu_info()->root_pgt_changed = true;
> +
> /* Don't flush user global mappings from the TLB. Don't tick TLB clock. */
> asm volatile ( "mov %0, %%cr3" : : "r" (v->arch.cr3) : "memory" );
>
> diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
> index 033dd05958..63e819ca38 100644
> --- a/xen/arch/x86/smp.c
> +++ b/xen/arch/x86/smp.c
> @@ -208,7 +208,7 @@ void invalidate_interrupt(struct cpu_user_regs *regs)
> ack_APIC_irq();
> perfc_incr(ipis);
> if ( (flags & FLUSH_VCPU_STATE) && __sync_local_execstate() )
> - flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL);
> + flags &= ~(FLUSH_TLB | FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
> if ( flags & ~(FLUSH_VCPU_STATE | FLUSH_ORDER_MASK) )
> flush_area_local(flush_va, flags);
> cpumask_clear_cpu(smp_processor_id(), &flush_cpumask);
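So the complete remote propagation path, using only functions from the
hunks in this patch (a call-chain sketch, not literal code):

    do_mmu_update()
      -> flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL)  /* IPI */
        -> invalidate_interrupt()            /* on each CPU in mask */
          -> flush_area_local(flush_va, flags)
            -> get_cpu_info()->root_pgt_changed = true;

The FLUSH_VCPU_STATE short-circuit above presumably remains safe
because a CPU which has just synced its execstate will go through
write_ptbase(), and hence set the flag again, before re-entering the
guest.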
> diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
> index a2fea94f4c..9e2aefb00f 100644
> --- a/xen/arch/x86/x86_64/asm-offsets.c
> +++ b/xen/arch/x86/x86_64/asm-offsets.c
> @@ -143,6 +143,7 @@ void __dummy__(void)
> OFFSET(CPUINFO_shadow_spec_ctrl, struct cpu_info, shadow_spec_ctrl);
> OFFSET(CPUINFO_use_shadow_spec_ctrl, struct cpu_info, use_shadow_spec_ctrl);
> OFFSET(CPUINFO_bti_ist_info, struct cpu_info, bti_ist_info);
> + OFFSET(CPUINFO_root_pgt_changed, struct cpu_info, root_pgt_changed);
> DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info));
> BLANK();
>
> diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
> index 45d9842d09..30c9da5446 100644
> --- a/xen/arch/x86/x86_64/entry.S
> +++ b/xen/arch/x86/x86_64/entry.S
> @@ -128,10 +128,13 @@ restore_all_guest:
> /* Copy guest mappings and switch to per-CPU root page table. */
> mov VCPU_cr3(%rbx), %r9
> GET_STACK_END(dx)
> - mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi
> + mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rax
I think this assembly block is correct (based on the register
requirements at .Lrag_copy_done). However (as a check of whether I've
understood it correctly), I think it would be more obviously correct as:
mov STACK_CPUINFO_FIELD(pv_cr3)(%rdx), %rdi (no delta)
mov %rdi, %rax (move from lower down)
cmpb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
and (IIRC) gives slightly better static instruction scheduling on
in-order processors.
~Andrew
> + cmpb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
> + je .Lrag_copy_done
> + movb $0, STACK_CPUINFO_FIELD(root_pgt_changed)(%rdx)
> movabs $PADDR_MASK & PAGE_MASK, %rsi
> movabs $DIRECTMAP_VIRT_START, %rcx
> - mov %rdi, %rax
> + mov %rax, %rdi
> and %rsi, %rdi
> jz .Lrag_keep_cr3
> and %r9, %rsi
> @@ -148,6 +151,7 @@ restore_all_guest:
> sub $(ROOT_PAGETABLE_FIRST_XEN_SLOT - \
> ROOT_PAGETABLE_LAST_XEN_SLOT - 1) * 8, %rdi
> rep movsq
> +.Lrag_copy_done:
> mov STACK_CPUINFO_FIELD(cr4)(%rdx), %rdi
> mov %r9, STACK_CPUINFO_FIELD(xen_cr3)(%rdx)
> mov %rdi, %rsi
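Rendered as C for clarity, the new entry-path logic amounts to the
following (a sketch only; the assembly above is authoritative):

    struct cpu_info *ci = get_cpu_info();

    if ( ci->root_pgt_changed )           /* the cmpb $0 / je pair */
    {
        ci->root_pgt_changed = false;     /* the movb $0 */
        copy_guest_l4_slots();            /* stand-in for the movsq copies */
    }
    /* .Lrag_copy_done: cr4 / xen_cr3 handling continues as before */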
> diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
> index 3a0e1eef36..f2491b4423 100644
> --- a/xen/include/asm-x86/current.h
> +++ b/xen/include/asm-x86/current.h
> @@ -59,6 +59,14 @@ struct cpu_info {
> bool use_shadow_spec_ctrl;
> uint8_t bti_ist_info;
>
> + /*
> + * The following field controls copying of the L4 page table of 64-bit
> + * PV guests to the per-cpu root page table on entering the guest context.
> + * If set the L4 page table is being copied to the root page table and
> + * the field will be reset.
> + */
> + bool root_pgt_changed;
> +
> unsigned long __pad;
> /* get_stack_bottom() must be 16-byte aligned */
> };
> diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h
> index 2cade9cbfb..052f0fa403 100644
> --- a/xen/include/asm-x86/flushtlb.h
> +++ b/xen/include/asm-x86/flushtlb.h
> @@ -103,6 +103,8 @@ void write_cr3(unsigned long cr3);
> #define FLUSH_VA_VALID 0x800
> /* Flush CPU state */
> #define FLUSH_VCPU_STATE 0x1000
> + /* Flush the per-cpu root page table */
> +#define FLUSH_ROOT_PGTBL 0x2000
>
> /* Flush local TLBs/caches. */
> unsigned int flush_area_local(const void *va, unsigned int flags);
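A caller wanting remote root page tables resynced along with a TLB
flush then does what do_mmu_update() does above:

    /* Flush TLBs and force an L4 recopy on every CPU in mask. */
    flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);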
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel