[Xen-devel] [PATCH RFC 26/44] x86/pt-shadow: Maintain a small cache of shadowed frames
This improves the shadowing performance substantially. In particular, system
calls for 64bit PV guests (which switch between the user and kernel
pagetables) no longer suffer a 4K copy hit in both directions.
See the code comments for reasoning and the algorithm description.
Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
---
xen/arch/x86/mm.c | 2 +
xen/arch/x86/mm/shadow/multi.c | 2 +
xen/arch/x86/pv/pt-shadow.c | 196 ++++++++++++++++++++++++++++++++-----
xen/include/asm-x86/pv/pt-shadow.h | 9 ++
4 files changed, 186 insertions(+), 23 deletions(-)
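
To make the cache behaviour easier to review in isolation, below is a minimal
user-space sketch of the most-recently-used scheme this patch implements. The
names (cache_entry_t, shadow_for, copy_l4) and the example cr3 values are
illustrative stand-ins, not the interfaces added by the patch itself.

/*
 * Purely illustrative model of the 4-entry MRU cache: page-aligned cr3
 * packed with the shadow frame index in the low bits, promote on hit,
 * recycle the least recently used slot (and redo the 4K copy) on miss.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT      12
#define L4_SHADOW_ORDER 2
#define NR_L4_SHADOWS   (1u << L4_SHADOW_ORDER)

typedef union cache_entry {
    unsigned long raw;
    struct {
        unsigned long idx:PAGE_SHIFT;            /* which percpu shadow frame */
        unsigned long cr3_mfn:(64 - PAGE_SHIFT); /* guest L4 being shadowed */
    };
} cache_entry_t;

static cache_entry_t cache[NR_L4_SHADOWS]; /* cache[0] is most recently used */

/* Stand-in for the 4K copy into percpu shadow frame 'idx'. */
static void copy_l4(unsigned int idx, unsigned long cr3)
{
    printf("4K copy: cr3 %#lx -> shadow frame %u\n", cr3, idx);
}

/* Return the shadow frame index to run on for 'cr3', maintaining MRU order. */
static unsigned int shadow_for(unsigned long cr3)
{
    unsigned int i;
    cache_entry_t tmp;

    for ( i = 0; i < NR_L4_SHADOWS; ++i )
        if ( cache[i].cr3_mfn == cr3 >> PAGE_SHIFT )
        {
            /* Hit: rotate this entry to the front; no copy needed. */
            tmp = cache[i];
            memmove(&cache[1], &cache[0], i * sizeof(cache[0]));
            cache[0] = tmp;

            return cache[0].idx;
        }

    /* Miss: recycle the least recently used slot and redo the 4K copy. */
    tmp = cache[NR_L4_SHADOWS - 1];
    memmove(&cache[1], &cache[0], (NR_L4_SHADOWS - 1) * sizeof(cache[0]));
    tmp.cr3_mfn = cr3 >> PAGE_SHIFT;
    cache[0] = tmp;
    copy_l4(cache[0].idx, cr3);

    return cache[0].idx;
}

int main(void)
{
    unsigned int i;

    for ( i = 0; i < NR_L4_SHADOWS; ++i )
        cache[i] = (cache_entry_t){ .idx = i };

    /*
     * A 64bit PV vcpu bouncing between kernel and user cr3 on syscalls:
     * only the first switch to each pagetable pays for a copy.
     */
    shadow_for(0x1000);   /* miss - copies */
    shadow_for(0x2000);   /* miss - copies */
    shadow_for(0x1000);   /* hit  - no copy */
    shadow_for(0x2000);   /* hit  - no copy */

    return 0;
}

With both of a vcpu's pagetables resident in the cache, the syscall path no
longer pays the 4K copy; copies only happen when a pagetable is first seen or
has been evicted.
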
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d5c69c0..f8f15e9 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2413,6 +2413,8 @@ int free_page_type(struct page_info *page, unsigned long type,
case PGT_l4_page_table:
ASSERT(preemptible);
rc = free_l4_table(page);
+ if ( !rc )
+ pt_shadow_l4_invlpg(owner, page);
break;
default:
gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 9c929ed..f9ec5aa 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1895,6 +1895,8 @@ void sh_destroy_l4_shadow(struct domain *d, mfn_t smfn)
}
});
+ pt_shadow_l4_invlpg(d, sp);
+
/* Put the memory back in the pool */
shadow_free(d, smfn);
}
diff --git a/xen/arch/x86/pv/pt-shadow.c b/xen/arch/x86/pv/pt-shadow.c
index 33cb303..b4f2b86 100644
--- a/xen/arch/x86/pv/pt-shadow.c
+++ b/xen/arch/x86/pv/pt-shadow.c
@@ -24,6 +24,10 @@
#include <asm/pv/pt-shadow.h>
+/* Override macros from asm/mm.h to make them work with mfn_t */
+#undef page_to_mfn
+#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
+
/*
* To use percpu linear ranges, we require that no two pcpus have %cr3
* pointing at the same L4 pagetable at the same time.
@@ -38,19 +42,44 @@
*
* The algorithm is fairly simple.
*
+ * - A small cache of shadowed L4s from the same guest is maintained.
* - When a pcpu is switching to a new vcpu cr3 and shadowing is necessary,
- * perform a full 4K copy of the guests frame into a percpu frame, and run
- * on that.
+ * the cache is searched.
+ * - If the new cr3 is already cached, use our existing shadow.
+ * - If not, drop an entry and shadow the new frame with a full 4K copy.
* - When a write to a guests L4 pagetable occurs, the update must be
* propagated to all existing shadows. An IPI is sent to the domains
* dirty mask indicating which frame/slot was updated, and each pcpu
* checks to see whether it needs to sync the update into its shadow.
+ * - When a guest L4 pagetable is freed, it must be dropped from any caches,
+ * as Xen will allow it to become writeable to the guest again, and its
+ * contents will go stale. The purge uses the same IPI mechanism as writes.
+ */
+
+#define L4_SHADOW_ORDER 2
+#define NR_L4_SHADOWS (1ul << L4_SHADOW_ORDER)
+
+/*
+ * An individual cache entry. Contains a %cr3 which has been cached, and the
+ * index of the percpu shadow frame used for it.
+ *
+ * The layout relies on %cr3 being page aligned, with the index stored in the
+ * lower bits. idx could be a smaller bitfield, but there is no other
+ * information to store, and having it as an 8bit field results in better
+ * compiled code.
*/
+typedef union pt_cache_entry {
+ unsigned long raw;
+ struct {
+ uint8_t idx;
+ unsigned long :4, cr3_mfn:52;
+ };
+} pt_cache_entry_t;
struct pt_shadow {
/*
- * A frame used to shadow a vcpus intended pagetable. When shadowing,
- * this frame is the one actually referenced by %cr3.
+ * A cache of frames used to shadow a vcpu's intended pagetables. When
+ * shadowing, one of these frames is the one actually referenced by %cr3.
*/
paddr_t shadow_l4;
l4_pgentry_t *shadow_l4_va;
@@ -63,29 +92,60 @@ struct pt_shadow {
*/
const struct domain *domain;
- /* If nonzero, a guests pagetable which we are shadowing. */
- paddr_t shadowing;
+ /*
+ * A collection of %cr3's, belonging to @p domain, which are shadowed
+ * locally.
+ *
+ * A cache entry is in use if cr3_mfn != 0, and free otherwise. The cache is
+ * maintained in most-recently-used order; as a result, cache[0].cr3_mfn
+ * should always match the running vcpu's cr3 (v->arch.cr3 >> PAGE_SHIFT).
+ *
+ * The cache[].idx fields are always unique and in the range
+ * [0, NR_L4_SHADOWS). Their order, however, will vary as the
+ * most-recently-used order is maintained.
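+ *
+ * e.g. having just switched to cr3 A after previously running on cr3 B,
+ * the cache reads { A, B, <free>, <free> }, each entry keeping its idx.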
+ */
+ pt_cache_entry_t cache[NR_L4_SHADOWS];
};
static DEFINE_PER_CPU(struct pt_shadow, ptsh);
+static l4_pgentry_t *shadow_l4_va(struct pt_shadow *ptsh, unsigned int idx)
+{
+ return _p(ptsh->shadow_l4_va) + idx * PAGE_SIZE;
+}
+
+static paddr_t shadow_l4(struct pt_shadow *ptsh, unsigned int idx)
+{
+ return ptsh->shadow_l4 + idx * PAGE_SIZE;
+}
+
int pt_shadow_alloc(unsigned int cpu)
{
struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
- unsigned int memflags = 0;
+ unsigned int memflags = 0, i;
nodeid_t node = cpu_to_node(cpu);
struct page_info *pg;
+ mfn_t mfns[NR_L4_SHADOWS];
if ( node != NUMA_NO_NODE )
memflags = MEMF_node(node);
- pg = alloc_domheap_page(NULL, memflags);
+ pg = alloc_domheap_pages(NULL, L4_SHADOW_ORDER, memflags);
if ( !pg )
return -ENOMEM;
ptsh->shadow_l4 = page_to_maddr(pg);
- ptsh->shadow_l4_va = __map_domain_page_global(pg);
+ for ( i = 0; i < ARRAY_SIZE(mfns); ++i )
+ {
+ /* Initialise the cache (ascending idx fields). */
+ ptsh->cache[i] = (pt_cache_entry_t){ i };
+
+ /* Collect MFNs to vmap(). */
+ mfns[i] = mfn_add(maddr_to_mfn(ptsh->shadow_l4), i);
+ }
+
+ ptsh->shadow_l4_va = vmap(mfns, ARRAY_SIZE(mfns));
if ( !ptsh->shadow_l4_va )
return -ENOMEM;
@@ -98,17 +158,35 @@ void pt_shadow_free(unsigned int cpu)
if ( ptsh->shadow_l4_va )
{
- unmap_domain_page_global(ptsh->shadow_l4_va);
+ vunmap(ptsh->shadow_l4_va);
ptsh->shadow_l4_va = NULL;
}
if ( ptsh->shadow_l4 )
{
- free_domheap_page(maddr_to_page(ptsh->shadow_l4));
+ free_domheap_pages(maddr_to_page(ptsh->shadow_l4), L4_SHADOW_ORDER);
ptsh->shadow_l4 = 0;
}
}
+static pt_cache_entry_t *pt_cache_lookup(
+ struct pt_shadow *ptsh, unsigned long maddr)
+{
+ unsigned int i;
+
+ ASSERT(!local_irq_is_enabled());
+
+ for ( i = 0; i < ARRAY_SIZE(ptsh->cache); ++i )
+ {
+ pt_cache_entry_t *ent = &ptsh->cache[i];
+
+ if ( ent->cr3_mfn == (maddr >> PAGE_SHIFT) )
+ return ent;
+ }
+
+ return NULL;
+}
+
/*
* We only need to shadow 4-level PV guests. All other guests have per-vcpu
* monitor tables which are never scheduled on concurrent pcpus. Care needs
@@ -126,6 +204,7 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
unsigned int cpu = smp_processor_id();
struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
unsigned long flags, new_cr3 = v->arch.cr3;
+ pt_cache_entry_t *ent;
/*
* IPIs for updates are based on the domain dirty mask. If we ever switch
@@ -135,8 +214,12 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
if ( ptsh->domain &&
ptsh->domain != v->domain )
{
+ unsigned int i;
+
ptsh->domain = NULL;
- ptsh->shadowing = 0;
+
+ for ( i = 0; i < ARRAY_SIZE(ptsh->cache); ++i )
+ ptsh->cache[i].cr3_mfn = 0;
}
/* No shadowing necessary? Run on the intended pagetable. */
@@ -145,10 +228,6 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
ptsh->domain = v->domain;
- /* Fastpath, if we are already shadowing the intended pagetable. */
- if ( ptsh->shadowing == new_cr3 )
- return ptsh->shadow_l4;
-
/*
* We may be called with interrupts disabled (e.g. context switch), or
* interrupts enabled (e.g. new_guest_cr3()).
@@ -158,14 +237,46 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
*/
local_irq_save(flags);
+ ent = pt_cache_lookup(ptsh, new_cr3);
+ if ( ent )
+ {
+ /*
+ * Cache hit. Promote this entry to being most recently used (if it
+ * isn't already).
+ */
+ unsigned int cache_idx = ent - ptsh->cache;
+
+ if ( cache_idx )
+ {
+ pt_cache_entry_t tmp = *ent;
+
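+ /* Deliberate fall-through: slide entries 0 .. cache_idx-1 down one slot. */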
+ switch ( cache_idx )
+ {
+ case 3: ptsh->cache[3] = ptsh->cache[2];
+ case 2: ptsh->cache[2] = ptsh->cache[1];
+ case 1: ptsh->cache[1] = ptsh->cache[0];
+ ptsh->cache[0] = tmp;
+ }
+ }
+ local_irq_restore(flags);
+ }
+ else
{
+ /*
+ * Cache miss. Recycle whatever was in the last slot, promote it to
+ * being most recently used, and copy the entire pagetable.
+ */
unsigned int slot = l4_table_offset(PERCPU_LINEAR_START);
+ unsigned int idx = ptsh->cache[3].idx;
l4_pgentry_t *l4t, *vcpu_l4t;
- ptsh->shadowing = new_cr3;
+ ptsh->cache[3] = ptsh->cache[2];
+ ptsh->cache[2] = ptsh->cache[1];
+ ptsh->cache[1] = ptsh->cache[0];
+ ptsh->cache[0] = (pt_cache_entry_t){ new_cr3 | idx };
local_irq_restore(flags);
- l4t = ptsh->shadow_l4_va;
+ l4t = shadow_l4_va(ptsh, idx);
vcpu_l4t = map_domain_page(maddr_to_mfn(new_cr3));
/*
@@ -184,7 +295,9 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
unmap_domain_page(vcpu_l4t);
}
- return ptsh->shadow_l4;
+ ASSERT(ptsh->cache[0].cr3_mfn == (new_cr3 >> PAGE_SHIFT));
+
+ return shadow_l4(ptsh, ptsh->cache[0].idx);
}
struct ptsh_ipi_info
@@ -193,6 +306,7 @@ struct ptsh_ipi_info
const struct page_info *pg;
enum {
PTSH_IPI_WRITE,
+ PTSH_IPI_INVLPG,
} op;
unsigned int slot;
};
@@ -202,29 +316,49 @@ static void _pt_shadow_ipi(void *arg)
unsigned int cpu = smp_processor_id();
struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
const struct ptsh_ipi_info *info = arg;
- unsigned long maddr = page_to_maddr(info->pg);
+ pt_cache_entry_t *ent;
/* No longer shadowing state from this domain? Nothing to do. */
if ( info->d != ptsh->domain )
return;
+ ent = pt_cache_lookup(ptsh, page_to_maddr(info->pg));
+
/* Not shadowing this frame? Nothing to do. */
- if ( ptsh->shadowing != maddr )
+ if ( ent == NULL )
return;
switch ( info->op )
{
l4_pgentry_t *l4t, *vcpu_l4t;
+ unsigned int cache_idx, shadow_idx;
case PTSH_IPI_WRITE:
- l4t = ptsh->shadow_l4_va;
- vcpu_l4t = map_domain_page(maddr_to_mfn(maddr));
+ l4t = shadow_l4_va(ptsh, ent->idx);
+ vcpu_l4t = map_domain_page(page_to_mfn(info->pg));
l4t[info->slot] = vcpu_l4t[info->slot];
unmap_domain_page(vcpu_l4t);
break;
+ case PTSH_IPI_INVLPG:
+ cache_idx = ent - ptsh->cache;
+ shadow_idx = ent->idx;
+
+ /*
+ * Demote the dropped entry to least-recently-used, so it is the next
+ * entry to be reused.
+ */
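+ /* Deliberate fall-through: less-recently-used entries each move up one slot. */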
+ switch ( cache_idx )
+ {
+ case 0: BUG(); /* ??? Freeing the L4 which current is running on! */
+ case 1: ptsh->cache[1] = ptsh->cache[2];
+ case 2: ptsh->cache[2] = ptsh->cache[3];
+ case 3: ptsh->cache[3] = (pt_cache_entry_t){ shadow_idx };
+ }
+ break;
+
default:
ASSERT_UNREACHABLE();
}
@@ -248,6 +382,22 @@ void pt_shadow_l4_write(const struct domain *d, const struct page_info *pg,
on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
}
+void pt_shadow_l4_invlpg(const struct domain *d, const struct page_info *pg)
+{
+ struct ptsh_ipi_info info;
+
+ if ( !pt_need_shadow(d) )
+ return;
+
+ info = (struct ptsh_ipi_info){
+ .d = d,
+ .pg = pg,
+ .op = PTSH_IPI_INVLPG,
+ };
+
+ on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/include/asm-x86/pv/pt-shadow.h b/xen/include/asm-x86/pv/pt-shadow.h
index 6e71e99..d5576f4 100644
--- a/xen/include/asm-x86/pv/pt-shadow.h
+++ b/xen/include/asm-x86/pv/pt-shadow.h
@@ -47,6 +47,13 @@ unsigned long pt_maybe_shadow(struct vcpu *v);
void pt_shadow_l4_write(
const struct domain *d, const struct page_info *pg, unsigned int slot);
+/*
+ * Called when an L4 pagetable is freed. The PT shadow logic ensures that it
+ * is purged from any caches.
+ */
+void pt_shadow_l4_invlpg(
+ const struct domain *d, const struct page_info *pg);
+
#else /* !CONFIG_PV */
static inline int pt_shadow_alloc(unsigned int cpu) { return 0; }
@@ -58,6 +65,8 @@ static inline unsigned long pt_maybe_shadow(struct vcpu *v)
}
static inline void pt_shadow_l4_write(
const struct domain *d, const struct page_info *pg, unsigned int slot) { }
+static inline void pt_shadow_l4_invlpg(
+ const struct domain *d, const struct page_info *pg) { }
#endif /* CONFIG_PV */
--
2.1.4