[Xen-devel] [PATCH RFC 24/44] x86/mapcache: Reimplement map_domain_page() from scratch
There are two reasons:
 1) To stop using the per-domain range for the mapcache
 2) To make map_domain_page() safe to use during context switches

The new implementation is entirely percpu, and rather simpler.  See the
comment at the top of domain_page.c for a description of the algorithm.

A side effect of the new implementation is that we can get rid of struct
mapcache_{vcpu,domain} entirely, and mapcache_override_current().

Signed-off-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>

TODO: Consider whether to try and lazily unmap, utilising other TLB flush
scenarios rather than forcing an invlpg on each unmap.
---
 xen/arch/x86/domain.c        |   6 -
 xen/arch/x86/domain_page.c   | 353 +++++++++++++------------------------------
 xen/arch/x86/pv/dom0_build.c |   3 -
 xen/include/asm-x86/config.h |   7 -
 xen/include/asm-x86/domain.h |  42 -----
 5 files changed, 106 insertions(+), 305 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 93e81c0..3d9e7fb 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -324,10 +324,6 @@ int vcpu_initialise(struct vcpu *v)
 
     v->arch.flags = TF_kernel_mode;
 
-    rc = mapcache_vcpu_init(v);
-    if ( rc )
-        return rc;
-
     if ( !is_idle_domain(d) )
     {
         paging_vcpu_init(v);
@@ -478,8 +474,6 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags,
         d->arch.emulation_flags = emflags;
     }
 
-    mapcache_domain_init(d);
-
     HYPERVISOR_COMPAT_VIRT_START(d) =
         is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
 
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 8f2bcd4..c17ff66 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -18,291 +18,131 @@
 #include <asm/hardirq.h>
 #include <asm/setup.h>
 
-static DEFINE_PER_CPU(struct vcpu *, override);
+/*
+ * Global mapcache entries are implemented using the vmap() infrastructure.
+ *
+ * Local mapcache entries are implemented with a percpu linear range, starting
+ * at PERCPU_MAPCACHE_START.  The maximum number of concurrent mappings we
+ * expect to use (NR_MAPCACHE_SLOTS) is for a nested pagewalk.  Being a small
+ * number, allocations are tracked with a simple bitmap (inuse).
+ *
+ * There is plenty of linear address space to use, so addresses are handed out
+ * by index into the inuse bitmap, with unmapped guard pages in between, to
+ * help catch bounds errors in the code using the mappings.
+ *
+ * It is *not* safe to pass local mapcache mappings to other CPUs to use.
+ */
 
-static inline struct vcpu *mapcache_current_vcpu(void)
-{
-    /* In the common case we use the mapcache of the running VCPU. */
-    struct vcpu *v = this_cpu(override) ?: current;
+struct mapcache_info {
+#define NR_MAPCACHE_SLOTS (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
+    unsigned long inuse;
+};
+static DEFINE_PER_CPU(struct mapcache_info, mapcache_info);
 
-    /*
-     * When current isn't properly set up yet, this is equivalent to
-     * running in an idle vCPU (callers must check for NULL).
-     */
-    if ( v == INVALID_VCPU )
-        return NULL;
+static unsigned long mapcache_idx_to_linear(unsigned int idx)
+{
+    return PERCPU_MAPCACHE_START + pfn_to_paddr(idx * 2 + 1);
+}
 
-    /*
-     * When using efi runtime page tables, we have the equivalent of the idle
-     * domain's page tables but current may point at another domain's VCPU.
-     * Return NULL as though current is not properly set up yet.
-     */
-    if ( efi_rs_using_pgtables() )
-        return NULL;
+static unsigned int mapcache_linear_to_idx(unsigned long linear)
+{
+    return paddr_to_pfn(linear - PERCPU_MAPCACHE_START) / 2;
+}
 
-    /*
-     * If guest_table is NULL, and we are running a paravirtualised guest,
-     * then it means we are running on the idle domain's page table and must
-     * therefore use its mapcache.
-     */
-    if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
-    {
-        /* If we really are idling, perform lazy context switch now. */
-        if ( (v = idle_vcpu[smp_processor_id()]) == current )
-            sync_local_execstate();
-        /* We must now be running on the idle page table. */
-        ASSERT(read_cr3() == this_cpu(percpu_idle_pt));
-    }
+static l1_pgentry_t *mapcache_l1e(unsigned long linear)
+{
+    l1_pgentry_t *l1t = (l1_pgentry_t *)PERCPU_MAPCACHE_L1ES;
 
-    return v;
+    return &l1t[l1_table_offset(linear)];
 }
 
-void __init mapcache_override_current(struct vcpu *v)
+/*
+ * Look up a mapcache entry, based on a linear address, ASSERT()ing that it is
+ * bounded sensibly and in use.
+ */
+static l1_pgentry_t *lookup_inuse_mapcache_entry(
+    unsigned long linear, unsigned int *p_idx)
 {
-    this_cpu(override) = v;
-}
+    unsigned int idx;
+    l1_pgentry_t *pl1e;
 
-#define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
-#define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
-#define MAPCACHE_L1ENT(idx) \
-    __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+    ASSERT(linear >= PERCPU_MAPCACHE_START && linear < PERCPU_MAPCACHE_END);
+
+    idx = mapcache_linear_to_idx(linear);
+    ASSERT(idx < NR_MAPCACHE_SLOTS);
+    ASSERT(test_bit(idx, &this_cpu(mapcache_info).inuse));
+
+    if ( p_idx )
+        *p_idx = idx;
+
+    pl1e = mapcache_l1e(linear);
+    ASSERT(l1e_get_flags(*pl1e) & _PAGE_PRESENT);
+
+    return pl1e;
+}
 
 void *map_domain_page(mfn_t mfn)
 {
-    unsigned long flags;
-    unsigned int idx, i;
-    struct vcpu *v;
-    struct mapcache_domain *dcache;
-    struct mapcache_vcpu *vcache;
-    struct vcpu_maphash_entry *hashent;
+    unsigned long flags, linear;
+    unsigned int idx;
+    struct mapcache_info *mci = &this_cpu(mapcache_info);
+    l1_pgentry_t *pl1e;
 
 #ifdef NDEBUG
     if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
         return mfn_to_virt(mfn_x(mfn));
 #endif
 
-    v = mapcache_current_vcpu();
-    if ( !v || !is_pv_vcpu(v) )
+    if ( this_cpu(curr_extended_directmap) )
         return mfn_to_virt(mfn_x(mfn));
 
-    dcache = &v->domain->arch.pv_domain.mapcache;
-    vcache = &v->arch.pv_vcpu.mapcache;
-    if ( !dcache->inuse )
-        return mfn_to_virt(mfn_x(mfn));
-
-    perfc_incr(map_domain_page_count);
-
+    /*
+     * map_domain_page() is used from many contexts, including fault handlers.
+     * Disable interrupts to keep the inuse bitmap consistent with the l1t.
+     *
+     * Be aware!  Any #PF inside this region will most likely recurse with the
+     * spurious pagefault handler until the BUG_ON() is hit.
+     */
     local_irq_save(flags);
 
-    hashent = &vcache->hash[MAPHASH_HASHFN(mfn_x(mfn))];
-    if ( hashent->mfn == mfn_x(mfn) )
-    {
-        idx = hashent->idx;
-        ASSERT(idx < dcache->entries);
-        hashent->refcnt++;
-        ASSERT(hashent->refcnt);
-        ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == mfn_x(mfn));
-        goto out;
-    }
-
-    spin_lock(&dcache->lock);
-
-    /* Has some other CPU caused a wrap? We must flush if so. */
-    if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
-    {
-        vcache->shadow_epoch = dcache->epoch;
-        if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
-        {
-            perfc_incr(domain_page_tlb_flush);
-            flush_tlb_local();
-        }
-    }
+    idx = find_first_zero_bit(&mci->inuse, NR_MAPCACHE_SLOTS);
+    BUG_ON(idx == NR_MAPCACHE_SLOTS);
 
-    idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
-    if ( unlikely(idx >= dcache->entries) )
-    {
-        unsigned long accum = 0, prev = 0;
-
-        /* /First/, clean the garbage map and update the inuse list. */
-        for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
-        {
-            accum |= prev;
-            dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
-            prev = ~dcache->inuse[i];
-        }
-
-        if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) )
-            idx = find_first_zero_bit(dcache->inuse, dcache->entries);
-        else
-        {
-            /* Replace a hash entry instead. */
-            i = MAPHASH_HASHFN(mfn_x(mfn));
-            do {
-                hashent = &vcache->hash[i];
-                if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
-                {
-                    idx = hashent->idx;
-                    ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(idx)) == hashent->mfn);
-                    l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
-                    hashent->idx = MAPHASHENT_NOTINUSE;
-                    hashent->mfn = ~0UL;
-                    break;
-                }
-                if ( ++i == MAPHASH_ENTRIES )
-                    i = 0;
-            } while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
-        }
-        BUG_ON(idx >= dcache->entries);
-
-        /* /Second/, flush TLBs. */
-        perfc_incr(domain_page_tlb_flush);
-        flush_tlb_local();
-        vcache->shadow_epoch = ++dcache->epoch;
-        dcache->tlbflush_timestamp = tlbflush_current_time();
-    }
+    __set_bit(idx, &mci->inuse);
 
-    set_bit(idx, dcache->inuse);
-    dcache->cursor = idx + 1;
+    linear = mapcache_idx_to_linear(idx);
+    pl1e = mapcache_l1e(linear);
 
-    spin_unlock(&dcache->lock);
+    ASSERT(!(l1e_get_flags(*pl1e) & _PAGE_PRESENT));
+    *pl1e = l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW);
+    barrier(); /* Ensure the pagetable is updated before enabling interrupts. */
 
-    l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
-
- out:
     local_irq_restore(flags);
-    return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx);
+
+    return (void *)linear;
 }
 
 void unmap_domain_page(const void *ptr)
 {
+    struct mapcache_info *mci = &this_cpu(mapcache_info);
+    unsigned long flags, linear = (unsigned long)ptr;
     unsigned int idx;
-    struct vcpu *v;
-    struct mapcache_domain *dcache;
-    unsigned long va = (unsigned long)ptr, mfn, flags;
-    struct vcpu_maphash_entry *hashent;
+    l1_pgentry_t *pl1e;
 
-    if ( va >= DIRECTMAP_VIRT_START )
+    if ( linear >= DIRECTMAP_VIRT_START )
         return;
 
-    ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-
-    v = mapcache_current_vcpu();
-    ASSERT(v && is_pv_vcpu(v));
-
-    dcache = &v->domain->arch.pv_domain.mapcache;
-    ASSERT(dcache->inuse);
-
-    idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
-    mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
-    hashent = &v->arch.pv_vcpu.mapcache.hash[MAPHASH_HASHFN(mfn)];
+    pl1e = lookup_inuse_mapcache_entry(linear, &idx);
 
     local_irq_save(flags);
 
-    if ( hashent->idx == idx )
-    {
-        ASSERT(hashent->mfn == mfn);
-        ASSERT(hashent->refcnt);
-        hashent->refcnt--;
-    }
-    else if ( !hashent->refcnt )
-    {
-        if ( hashent->idx != MAPHASHENT_NOTINUSE )
-        {
-            /* /First/, zap the PTE. */
-            ASSERT(l1e_get_pfn(MAPCACHE_L1ENT(hashent->idx)) ==
-                   hashent->mfn);
-            l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
-            /* /Second/, mark as garbage. */
-            set_bit(hashent->idx, dcache->garbage);
-        }
-
-        /* Add newly-freed mapping to the maphash. */
-        hashent->mfn = mfn;
-        hashent->idx = idx;
-    }
-    else
-    {
-        /* /First/, zap the PTE. */
-        l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
-        /* /Second/, mark as garbage. */
-        set_bit(idx, dcache->garbage);
-    }
+    *pl1e = l1e_empty();
+    asm volatile ( "invlpg %0" :: "m" (*(char *)ptr) : "memory" );
+    __clear_bit(idx, &mci->inuse);
 
     local_irq_restore(flags);
 }
 
-int mapcache_domain_init(struct domain *d)
-{
-    struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
-    unsigned int bitmap_pages;
-
-    if ( !is_pv_domain(d) || is_idle_domain(d) )
-        return 0;
-
-#ifdef NDEBUG
-    if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
-        return 0;
-#endif
-
-    BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
-                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
-                 MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
-    bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
-    dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
-    dcache->garbage = dcache->inuse +
-                      (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
-
-    spin_lock_init(&dcache->lock);
-
-    return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                    2 * bitmap_pages + 1,
-                                    NIL(l1_pgentry_t *), NULL);
-}
-
-int mapcache_vcpu_init(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
-    unsigned long i;
-    unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
-    unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
-
-    if ( !is_pv_vcpu(v) || !dcache->inuse )
-        return 0;
-
-    if ( ents > dcache->entries )
-    {
-        /* Populate page tables. */
-        int rc = create_perdomain_mapping(d, MAPCACHE_VIRT_START, ents,
-                                          NIL(l1_pgentry_t *), NULL);
-
-        /* Populate bit maps. */
-        if ( !rc )
-            rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
-                                          nr, NULL, NIL(struct page_info *));
-        if ( !rc )
-            rc = create_perdomain_mapping(d, (unsigned long)dcache->garbage,
-                                          nr, NULL, NIL(struct page_info *));
-
-        if ( rc )
-            return rc;
-
-        dcache->entries = ents;
-    }
-
-    /* Mark all maphash entries as not in use. */
-    BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
-    for ( i = 0; i < MAPHASH_ENTRIES; i++ )
-    {
-        struct vcpu_maphash_entry *hashent = &v->arch.pv_vcpu.mapcache.hash[i];
-
-        hashent->mfn = ~0UL; /* never valid to map */
-        hashent->idx = MAPHASHENT_NOTINUSE;
-    }
-
-    return 0;
-}
-
 void *map_domain_page_global(mfn_t mfn)
 {
     ASSERT(!in_irq() &&
@@ -345,10 +185,29 @@ unsigned long domain_page_map_to_mfn(const void *ptr)
         BUG_ON(!pl1e);
     }
     else
-    {
-        ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
-        pl1e = &__linear_l1_table[l1_linear_offset(va)];
-    }
+        pl1e = lookup_inuse_mapcache_entry(va, NULL);
 
     return l1e_get_pfn(*pl1e);
 }
+
+static __init __maybe_unused void build_assertions(void)
+{
+    struct mapcache_info info;
+
+    /* NR_MAPCACHE_SLOTS within the bounds of the inuse bitmap? */
+    BUILD_BUG_ON(NR_MAPCACHE_SLOTS > (sizeof(info.inuse) * 8));
+
+    /* Enough linear address space, including guard pages? */
+    BUILD_BUG_ON((NR_MAPCACHE_SLOTS * 2) >
+                 (PERCPU_MAPCACHE_END - PERCPU_MAPCACHE_START));
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/pv/dom0_build.c b/xen/arch/x86/pv/dom0_build.c
index 09c765a..3baf37b 100644
--- a/xen/arch/x86/pv/dom0_build.c
+++ b/xen/arch/x86/pv/dom0_build.c
@@ -698,7 +698,6 @@ int __init dom0_construct_pv(struct domain *d,
 
     /* We run on dom0's page tables for the final part of the build process. */
     write_ptbase(v);
-    mapcache_override_current(v);
 
     /* Copy the OS image and free temporary buffer. */
     elf.dest_base = (void*)vkern_start;
@@ -717,7 +716,6 @@ int __init dom0_construct_pv(struct domain *d,
         if ( (parms.virt_hypercall < v_start) ||
              (parms.virt_hypercall >= v_end) )
         {
-            mapcache_override_current(NULL);
             write_ptbase(current);
             printk("Invalid HYPERCALL_PAGE field in ELF notes.\n");
             rc = -1;
@@ -838,7 +836,6 @@ int __init dom0_construct_pv(struct domain *d,
         xlat_start_info(si, XLAT_start_info_console_dom0);
 
     /* Return to idle domain's page tables. */
-    mapcache_override_current(NULL);
     write_ptbase(current);
 
     update_domain_wallclock_time(d);
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index a95f8c8..f78cbde 100644
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -314,13 +314,6 @@ extern unsigned long xen_phys_start;
 #define LDT_VIRT_START(v)    \
     (GDT_VIRT_START(v) + (64*1024))
 
-/* map_domain_page() map cache. The second per-domain-mapping sub-area. */
-#define MAPCACHE_VCPU_ENTRIES    (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
-#define MAPCACHE_ENTRIES         (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
-#define MAPCACHE_VIRT_START      PERDOMAIN_VIRT_SLOT(1)
-#define MAPCACHE_VIRT_END        (MAPCACHE_VIRT_START + \
-                                  MAPCACHE_ENTRIES * PAGE_SIZE)
-
 /* Argument translation area. The third per-domain-mapping sub-area. */
 #define ARG_XLAT_VIRT_START      PERDOMAIN_VIRT_SLOT(2)
 /* Allow for at least one guard page (COMPAT_ARG_XLAT_SIZE being 2 pages): */
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index f699119..fa57c93 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -38,42 +38,6 @@ struct trap_bounce {
     unsigned long eip;
 };
 
-#define MAPHASH_ENTRIES 8
-#define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
-#define MAPHASHENT_NOTINUSE ((u32)~0U)
-struct mapcache_vcpu {
-    /* Shadow of mapcache_domain.epoch. */
-    unsigned int shadow_epoch;
-
-    /* Lock-free per-VCPU hash of recently-used mappings. */
-    struct vcpu_maphash_entry {
-        unsigned long mfn;
-        uint32_t idx;
-        uint32_t refcnt;
-    } hash[MAPHASH_ENTRIES];
-};
-
-struct mapcache_domain {
-    /* The number of array entries, and a cursor into the array. */
-    unsigned int entries;
-    unsigned int cursor;
-
-    /* Protects map_domain_page(). */
-    spinlock_t lock;
-
-    /* Garbage mappings are flushed from TLBs in batches called 'epochs'. */
-    unsigned int epoch;
-    u32 tlbflush_timestamp;
-
-    /* Which mappings are in use, and which are garbage to reap next epoch? */
-    unsigned long *inuse;
-    unsigned long *garbage;
-};
-
-int mapcache_domain_init(struct domain *);
-int mapcache_vcpu_init(struct vcpu *);
-void mapcache_override_current(struct vcpu *);
-
 /* x86/64: toggle guest between kernel and user modes. */
 void toggle_guest_mode(struct vcpu *);
 /* x86/64: toggle guest page tables between kernel and user modes. */
@@ -251,9 +215,6 @@ struct pv_domain
 
     atomic_t nr_l4_pages;
 
-    /* map_domain_page() mapping cache. */
-    struct mapcache_domain mapcache;
-
     struct cpuidmasks *cpuidmasks;
 };
 
@@ -444,9 +405,6 @@ struct arch_domain
 
 struct pv_vcpu
 {
-    /* map_domain_page() mapping cache. */
-    struct mapcache_vcpu mapcache;
-
     struct trap_info *trap_ctxt;
 
     unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE];
-- 
2.1.4
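
The external map_domain_page()/unmap_domain_page() API is unchanged by this
patch; only the implementation behind it moves to the percpu range.  As a
purely illustrative sketch (not part of the patch -- zero_domain_frame() is a
hypothetical caller, while map_domain_page(), unmap_domain_page() and
clear_page() are existing Xen interfaces), a typical user looks like:

/* Illustrative sketch only -- not part of the patch. */
static void zero_domain_frame(mfn_t mfn)
{
    /*
     * CPU-local mapping: either a directmap alias, or one of the
     * NR_MAPCACHE_SLOTS percpu slots described above.
     */
    void *p = map_domain_page(mfn);

    clear_page(p);

    /* Drops the mapping again (zapping the l1e and invlpg'ing locally). */
    unmap_domain_page(p);
}

Because such a mapping is strictly local to the mapping CPU, the pointer must
not be handed to another CPU; longer-lived or shared mappings should use
map_domain_page_global() instead.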